In [174]:
# NLTK setup: ensure punkt + punkt_tab are available and path is correct

import os, nltk, sys
from pathlib import Path

# Prefer venv-local nltk_data
venv_dir = Path.cwd() / ".venv" / "nltk_data"
project_dir = Path.cwd() / "nltk_data"
user_dir = Path.home() / "nltk_data"

for p in [venv_dir, project_dir, user_dir]:
    p.mkdir(parents=True, exist_ok=True)
    if str(p) not in nltk.data.path:
        nltk.data.path.insert(0, str(p))

# Download required resources quietly; NLTK>=3.9 needs punkt_tab too
for pkg in ["punkt", "punkt_tab"]:
    try:
        nltk.download(pkg, quiet=True)
    except Exception as e:
        print(f"Warning: failed to download {pkg}: {e}")

print("NLTK paths:", nltk.data.path)
# Tiny sanity check
try:
    from nltk.tokenize import word_tokenize
    print(word_tokenize("Quick test: tokenization works.", preserve_line=True))
except Exception as e:
    print("Tokenize sanity check failed:", e)

NLTK paths: ['/home/trinhthanh2508/Documents/Sentiment-analysis-using-Twitter-data/nltk_data', '/home/trinhthanh2508/nltk_data', '/home/trinhthanh2508/Documents/Sentiment-analysis-using-Twitter-data/.venv/nltk_data', '/home/trinhthanh2508/Documents/Sentiment-analysis-using-Twitter-data/.venv/share/nltk_data', '/home/trinhthanh2508/Documents/Sentiment-analysis-using-Twitter-data/.venv/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data', '/home/trinhthanh2508/nltk_data', '/home/trinhthanh2508/nltk_data', '/home/trinhthanh2508/nltk_data', './nltk_data', '.venv/nltk_data', '.venv/nltk_data']
['Quick', 'test', ':', 'tokenization', 'works', '.']


In [175]:
import pandas as pd
import numpy as np
import gensim 
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize 
import warnings
warnings.filterwarnings("ignore")

In [176]:
df = pd.read_csv('supervised_sample_datasets/lexicon_step1.csv')
df

Unnamed: 0.1,Unnamed: 0,created_at,user_id,username,tweet,place,near,clean_tweet,tokens,lemma,lemma_sentence,pos_tag,lemma_sentence(with POS),sentiword_analysis,vader_score,textblob_polarity,senti_textblob,senti_wordnet,senti_vader
0,0,2021-01-06,293175196,sjtafalla,UK Parliament: 2nd Covid Vaccine should be 21 ...,,London,parliament days not weeks challenge government...,"['parliament', 'days', 'not', 'weeks', 'challe...","['parliament', 'day', 'not', 'week', 'challeng...",parliament day not week challenge government s...,"[('parliament', 'n'), ('days', 'n'), ('not', '...",parliament day not week challenge government...,-0.625,-0.0572,0.000000,0,-1,-1
1,1,2021-01-06,1591779799,ellieelif,First dose of vaccination üíâ 5/1/2021..to comba...,,London,first dose syringe combating,"['first', 'dose', 'syringe', 'combating']","['first', 'dose', 'syringe', 'combating']",first dose syringe combating,"[('first', 'r'), ('dose', 'a'), ('syringe', 'n...",first dose syringe combating,0.000,0.0000,0.250000,1,0,0
2,2,2021-01-06,215143656,danananarama,"Time to forget about #COVID, #Brexit and #Trum...",,London,time forget sleep listening abcgrandstand via ...,"['time', 'forget', 'sleep', 'listening', 'abcg...","['time', 'forget', 'sleep', 'listening', 'abcg...",time forget sleep listening abcgrandstand via ...,"[('time', 'n'), ('forget', 'v'), ('sleep', 'a'...",time forget sleep listen abcgrandstand via a...,0.250,-0.2263,0.000000,0,1,-1
3,3,2021-01-06,336462129,veronica_foote_,@doctor_oxford Rachel you absolutely nailed it...,,London,rachel absolutely nailed tonight throughout pr...,"['rachel', 'absolutely', 'nailed', 'tonight', ...","['rachel', 'absolutely', 'nailed', 'tonight', ...",rachel absolutely nailed tonight throughout pr...,"[('rachel', 'n'), ('absolutely', 'r'), ('naile...",rachel absolutely nail tonight throughout pr...,1.375,0.6124,0.300000,1,1,1
4,4,2021-01-06,1063705581133934593,5herii,My kids can never say they don‚Äôt wanna do thei...,,London,kids never say not wana homework got sparkles ...,"['kids', 'never', 'say', 'not', 'wana', 'homew...","['kid', 'never', 'say', 'not', 'wana', 'homewo...",kid never say not wana homework got sparkle sp...,"[('kids', 'n'), ('never', 'r'), ('say', 'v'), ...",kid never say not wana homework get sparkle ...,-0.750,0.6573,0.025000,1,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29877,29877,2021-01-30,1200125350694576128,MayorJesse,Health Rover will be offering rapid COVID-19 a...,"{'type': 'Feature', 'bbox': [-72.4243, 40.8667...","Southampton, NY",health rover offering rapid antigen tests appo...,"['health', 'rover', 'offering', 'rapid', 'anti...","['health', 'rover', 'offering', 'rapid', 'anti...",health rover offering rapid antigen test appoi...,"[('health', 'n'), ('rover', 'n'), ('offering',...",health rover offering rapid antigen test app...,0.750,0.0000,0.000000,0,1,0
29878,29878,2021-01-27,803004340608843776,thatmikeny,#Americaorbust FOX complicit in so Manet death...,"{'type': 'Feature', 'bbox': [-72.4243, 40.8667...","Southampton, NY",americaorbust fox complicit manet deaths due c...,"['americaorbust', 'fox', 'complicit', 'manet',...","['americaorbust', 'fox', 'complicit', 'manet',...",americaorbust fox complicit manet death due ca...,"[('americaorbust', 'a'), ('fox', 'n'), ('compl...",americaorbust fox complicit manet death due ...,-0.625,-0.8126,-0.125000,-1,-1,-1
29879,29879,2021-01-21,408755761,ConstanceHunter,Here is @POTUS plan to defeat #COVID19. üëáüèª\nSt...,"{'type': 'Feature', 'bbox': [-72.4243, 40.8667...","Southampton, NY",plan defeat backhand index pointing light skin...,"['plan', 'defeat', 'backhand', 'index', 'point...","['plan', 'defeat', 'backhand', 'index', 'point...",plan defeat backhand index pointing light skin...,"[('plan', 'n'), ('defeat', 'v'), ('backhand', ...",plan defeat backhand index point light skin ...,0.000,-0.6369,0.400000,1,0,-1
29880,29880,2021-01-20,1247264861601619969,EBalabanidou,#morethanjustbones seems more relevant than ev...,"{'type': 'Feature', 'bbox': [-1.43332383719785...",Southampton General Hospital,morethanjustbones seems relevant ever feeling ...,"['morethanjustbones', 'seems', 'relevant', 'ev...","['morethanjustbones', 'seems', 'relevant', 'ev...",morethanjustbones seems relevant ever feeling ...,"[('morethanjustbones', 'n'), ('seems', 'v'), (...",morethanjustbones seem relevant ever feel sa...,0.000,0.6597,0.366667,1,0,1


In [177]:
#sample data
positive_df = df[df["senti_vader"] == 1]
positive_df = positive_df[:1000] #1000 positive sentiment
neutral_df = df[df["senti_vader"] == 0]
neutral_df = neutral_df[:1000] #1000 neutral sentiment
negative_df = df[df["senti_vader"] == -1]
negative_df = negative_df[:1000] #1000 neutral sentiment

In [178]:
df=[positive_df, neutral_df, negative_df]
df=pd.concat(df)
df=df.reset_index(drop=True)
df.to_csv('supervised_sample_datasets/sample_data.csv', index=False, encoding='utf_8_sig')

In [179]:
#after manual check
df = pd.read_csv('supervised_sample_datasets/sample_data.csv')

In [180]:
#after manual check
negative_num=len(df[df['senti_vader'] < 0])
print("negative:", negative_num)
neutral_num=len(df[df['senti_vader'] == 0])
print("neutral", neutral_num)
positive_num=len(df[df['senti_vader'] > 0])
print("positive", positive_num) 

negative: 1000
neutral 1000
positive 1000


In [181]:
# df=df.drop('Unnamed: 0', axis=1)

In [182]:
#BoW
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
from nltk.tokenize import word_tokenize

def bag_of_words(df):
#     bow_vectorizer = CountVectorizer(max_df=0.90, min_df=0.2, stop_words=None, tokenizer=word_tokenize) 
    bow_vectorizer = CountVectorizer() 
    bow = bow_vectorizer.fit_transform(df['lemma_sentence(with POS)']) 
    #print(bow_vectorizer.get_feature_names())
    #print(bow_vectorizer.vocabulary_)
    return bow

df_bow=bag_of_words(df)
df_bow.shape

(3000, 7371)

In [183]:
#TF-IDF
def tf_idf(df):
#     tf_idf_vectorizer = TfidfVectorizer(max_df=0.9, min_df=0.2, stop_words=None, tokenizer=word_tokenize, norm='l2') 
    tf_idf_vectorizer = TfidfVectorizer(norm='l2') #extract features
    tfidf = tf_idf_vectorizer.fit_transform(df['lemma_sentence(with POS)']) #vectors
    return tfidf
df_tfidf=tf_idf(df)
df_tfidf.shape

(3000, 7371)

In [184]:
#Word2vec

#referenceÔºöhttps://www.pythonf.cn/read/93491

#https://github.com/Shwetago/Sentiment_Analysis/blob/master/Twitter_Sentiment_Analysis.ipynb

from nltk.tokenize import word_tokenize

# 2Ô∏è‚É£ Ch·∫Øc ch·∫Øn c·ªôt text l√† string
texts = df['lemma_sentence(with POS)'].astype(str)

# 3Ô∏è‚É£ Tokenize t·ª´ng c√¢u
Tokenize_tweet = texts.apply(word_tokenize)
print(Tokenize_tweet.head())

# 4Ô∏è‚É£ Hu·∫•n luy·ªán Word2Vec (gensim 4+ uses vector_size)
Model_W2V = gensim.models.Word2Vec(
    sentences=Tokenize_tweet,   # danh s√°ch token
    vector_size=200,            # thay size b·∫±ng vector_size
    window=5,
    min_count=1,
    sg=1,                       # skip-gram
    hs=0,
    negative=10,
    workers=2,
    seed=34
)
print("Trained Word2Vec, vector_size=", getattr(Model_W2V, "vector_size", 200))

0    [rachel, absolutely, nail, tonight, throughout...
1    [kid, never, say, not, wana, homework, get, sp...
2       [not, mess, wear, mask, time, united, kingdom]
3    [problem, think, airfield, cost, remain, open,...
4    [remain, astonished, stock, market, not, much,...
Name: lemma_sentence(with POS), dtype: object
Trained Word2Vec, vector_size= 200
Trained Word2Vec, vector_size= 200


In [185]:
import numpy as np
import pandas as pd

def word2vec_tweet(tokens, size=200):
    vector = np.zeros(size)
    vector_cnt = 0
    for word in tokens:
        if word in Model_W2V.wv:  # ki·ªÉm tra t·ª´ c√≥ trong vocab
            vector += Model_W2V.wv[word]
            vector_cnt += 1
    if vector_cnt > 0:
        vector /= vector_cnt  # average
    return vector

def word2vec_tweet_2(tokens, size=200):
    vector = np.zeros(size)
    for word in tokens:
        if word in Model_W2V.wv:
            vector += Model_W2V.wv[word]
    return vector  # sum

# t·∫°o ma tr·∫≠n tweet
tweet_arr = np.zeros((len(Tokenize_tweet), 200))
for i in range(len(Tokenize_tweet)):
    tweet_arr[i, :] = word2vec_tweet(Tokenize_tweet[i], 200)

tweet_vec_df = pd.DataFrame(tweet_arr)
tweet_vec_df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.088259,-0.259626,0.112625,-0.02411,-0.138272,0.021312,-0.029404,-0.16072,0.215939,0.000346,...,0.247268,-0.071442,-0.039951,-0.086576,0.078897,-0.035102,-0.133019,-0.235957,0.167665,0.249311
1,0.068934,-0.261676,0.121841,-0.050017,-0.150805,0.013265,-0.043143,-0.177324,0.227205,-0.035048,...,0.234007,-0.104785,-0.032998,-0.068815,0.086702,-0.03658,-0.128775,-0.228731,0.187162,0.234819
2,0.040204,-0.295585,0.116661,-0.087856,-0.167294,0.020854,-0.059556,-0.165251,0.235944,-0.031105,...,0.249622,-0.114534,-0.045943,-0.093261,0.106041,-0.022749,-0.152569,-0.24066,0.196297,0.240968
3,0.074916,-0.254385,0.115911,-0.03784,-0.139198,0.016156,-0.037365,-0.165562,0.220569,-0.021031,...,0.231657,-0.088473,-0.038446,-0.076428,0.078788,-0.035862,-0.126689,-0.22291,0.174423,0.233954
4,0.05257,-0.268994,0.123903,-0.066325,-0.158135,0.010322,-0.0511,-0.179018,0.231678,-0.040417,...,0.234394,-0.111053,-0.033067,-0.071318,0.092606,-0.034113,-0.135067,-0.229212,0.193269,0.230624


In [186]:
#BoW for three classification models
#split the train and test datasets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df_bow, df['senti_textblob'],test_size = 0.2)

In [187]:
#parameters in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 200, num = 20)] #tree number
max_features = ['auto', 'sqrt','log2']
max_depth = [10,20,30,40]
min_samples_split = [2, 5, 10, 15]
min_samples_leaf = [1, 2, 5, 10]

# Create the param grid
param_grid_forest = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
print(param_grid_forest)

{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 20, 30, 40], 'min_samples_split': [2, 5, 10, 15], 'min_samples_leaf': [1, 2, 5, 10]}


In [188]:
#parameters in MNB
param_grid_nb = {'alpha': [0.01, 0.1, 0.5, 1.0, 5.0, 10.0],
                'fit_prior':[True, False]}
print(param_grid_nb)

{'alpha': [0.01, 0.1, 0.5, 1.0, 5.0, 10.0], 'fit_prior': [True, False]}


In [189]:
#parameters in SVC
# c_list=list(range(1,51))
param_grid_svc = {'C': [1, 10, 100, 1000],
                  'kernel': ['linear','poly','rbf','sigmoid'],
                  'degree': [1,2,3,4]}
print(param_grid_svc)

{'C': [1, 10, 100, 1000], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'degree': [1, 2, 3, 4]}


In [190]:
from sklearn.ensemble._forest import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

model_forest = RandomForestClassifier()
model_nb = MultinomialNB()
model_svc = SVC()

In [191]:
#best parameters for RF(with BoW)
from sklearn.model_selection import RandomizedSearchCV
RF_RandomGrid = RandomizedSearchCV(estimator = model_forest, param_distributions = param_grid_forest, cv = 10, verbose=2, n_jobs = 4)
RF_RandomGrid.fit(x_train, y_train)
RF_RandomGrid.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] END max_depth=10, max_features=log2, min_samples_leaf=5, min_samples_split=15, n_estimators=90; total time=   0.1s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=5, min_samples_split=15, n_estimators=90; total time=   0.1s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=5, min_samples_split=15, n_estimators=90; total time=   0.1s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=5, min_samples_split=15, n_estimators=90; total time=   0.1s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=5, min_samples_split=15, n_estimators=90; total time=   0.1s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=5, min_samples_split=15, n_estimators=90; total time=   0.1s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=5, min_samples_split=15, n_estimators=90; total time=   0.1s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=5, min_samples_split=15, n_estimators=

[CV] END max_depth=10, max_features=log2, min_samples_leaf=5, min_samples_split=15, n_estimators=90; total time=   0.1s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=5, min_samples_split=15, n_estimators=90; total time=   0.1s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=5, min_samples_split=10, n_estimators=180; total time=   0.8s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=5, min_samples_split=10, n_estimators=180; total time=   0.8s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=5, min_samples_split=10, n_estimators=180; total time=   0.8s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=5, min_samples_split=10, n_estimators=180; total time=   0.8s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=5, min_samples_split=10, n_estimators=180; total time=   0.8s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=5, min_samples_split=10, n_estimators=180; total time=   0.8s
[CV] END max_depth=30, max_feature

{'n_estimators': 180,
 'min_samples_split': 10,
 'min_samples_leaf': 5,
 'max_features': 'sqrt',
 'max_depth': 30}

In [192]:
#best parameters for MNB(with BoW)
NB_RandomGrid = RandomizedSearchCV(estimator = model_nb, param_distributions = param_grid_nb, cv = 10, verbose=2, n_jobs = 4)
NB_RandomGrid.fit(x_train, y_train)
NB_RandomGrid.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] END ..........................alpha=0.1, fit_prior=True; total time=   0.0s
[CV] END ..........................alpha=0.1, fit_prior=True; total time=   0.0s
[CV] END ..........................alpha=0.1, fit_prior=True; total time=   0.0s
[CV] END ..........................alpha=0.1, fit_prior=True; total time=   0.0s
[CV] END ..........................alpha=0.1, fit_prior=True; total time=   0.0s
[CV] END ..........................alpha=0.1, fit_prior=True; total time=   0.0s
[CV] END ..........................alpha=0.1, fit_prior=True; total time=   0.0s
[CV] END ..........................alpha=0.1, fit_prior=True; total time=   0.0s
[CV] END ..........................alpha=0.1, fit_prior=True; total time=   0.0s
[CV] END .........................alpha=0.5, fit_prior=False; total time=   0.0s
[CV] END ..........................alpha=0.1, fit_prior=True; total time=   0.0s
[CV] END .........................alpha=0.5, f

{'fit_prior': True, 'alpha': 0.5}

In [193]:
#best parameters for SVC(with BoW)
SVC_RandomGrid = RandomizedSearchCV(estimator = model_svc, param_distributions = param_grid_svc, cv = 10, verbose=2, n_jobs = 4)
SVC_RandomGrid.fit(x_train, y_train)
SVC_RandomGrid.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[CV] END ...................C=1000, degree=4, kernel=sigmoid; total time=   0.4s
[CV] END ...................C=1000, degree=4, kernel=sigmoid; total time=   0.4s
[CV] END ...................C=1000, degree=4, kernel=sigmoid; total time=   0.6s
[CV] END ...................C=1000, degree=4, kernel=sigmoid; total time=   0.6s
[CV] END ...................C=1000, degree=4, kernel=sigmoid; total time=   0.3s
[CV] END ...................C=1000, degree=4, kernel=sigmoid; total time=   0.5s
[CV] END ...................C=1000, degree=4, kernel=sigmoid; total time=   0.3s
[CV] END ...................C=1000, degree=4, kernel=sigmoid; total time=   0.5s
[CV] END ...................C=1000, degree=4, kernel=sigmoid; total time=   0.4s
[CV] END ...................C=1000, degree=4, kernel=sigmoid; total time=   0.4s
[CV] END ...................C=1000, degree=4, kernel=sigmoid; total time=   0.4s
[CV] END ...................C=1000, degree=4, kernel=sigmoid; total time=   0.4s
[CV] END ...................

{'kernel': 'poly', 'degree': 1, 'C': 1000}

In [194]:
#model establishment and results(BoW)
#Random Forest
model_forest = RandomForestClassifier(n_estimators=140,min_samples_split=10, min_samples_leaf=2, max_features='sqrt', max_depth=40)
model_forest.fit(x_train,y_train)
prediction = model_forest.predict(x_test)

from sklearn.metrics import classification_report 
print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

          -1       0.87      0.25      0.39       159
           0       0.72      0.90      0.80       191
           1       0.70      0.88      0.78       250

    accuracy                           0.72       600
   macro avg       0.76      0.68      0.66       600
weighted avg       0.75      0.72      0.68       600



In [195]:
model_nb = MultinomialNB(alpha=1.0, fit_prior=False)
model_nb = model_nb.fit(x_train,y_train)
prediction_nb = model_nb.predict(x_test)

print(classification_report(y_test, prediction_nb))

              precision    recall  f1-score   support

          -1       0.57      0.63      0.60       159
           0       0.74      0.47      0.58       191
           1       0.64      0.77      0.70       250

    accuracy                           0.64       600
   macro avg       0.65      0.62      0.62       600
weighted avg       0.65      0.64      0.63       600



In [196]:
model_svc = SVC(kernel='linear',degree=1, C=1)
model_svc = model_svc.fit(x_train,y_train)
prediction_svc = model_svc.predict(x_test)

print(classification_report(y_test, prediction_svc))

              precision    recall  f1-score   support

          -1       0.68      0.56      0.61       159
           0       0.71      0.84      0.77       191
           1       0.77      0.74      0.76       250

    accuracy                           0.73       600
   macro avg       0.72      0.72      0.71       600
weighted avg       0.73      0.73      0.72       600



In [197]:
#TF-IDF
x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(df_tfidf, df['senti_textblob'],test_size = 0.2)

In [198]:
from sklearn.ensemble._forest import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

model_forest = RandomForestClassifier()
model_nb = MultinomialNB()
model_svc = SVC()

In [199]:
#best parameters for RF(with TFIDF)
from sklearn.model_selection import RandomizedSearchCV
RF_RandomGrid = RandomizedSearchCV(estimator = model_forest, param_distributions = param_grid_forest, cv = 10, verbose=2, n_jobs = 4)
RF_RandomGrid.fit(x_train_2, y_train_2)
RF_RandomGrid.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=90; total time=   0.2s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=90; total time=   0.2s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=90; total time=   0.2s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=90; total time=   0.2s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=90; total time=   0.1s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=90; total time=   0.1s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=90; total time=   0.1s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=

{'n_estimators': 100,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 30}

In [200]:
NB_RandomGrid = RandomizedSearchCV(estimator = model_nb, param_distributions = param_grid_nb, cv = 10, verbose=2, n_jobs = 4)
NB_RandomGrid.fit(x_train_2, y_train_2)
NB_RandomGrid.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] END .........................alpha=0.1, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=0.1, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=0.1, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=0.1, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=0.1, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=0.1, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=0.1, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=0.1, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=0.1, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=5.0, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=5.0, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=0.1, f

{'fit_prior': False, 'alpha': 0.5}

In [201]:
SVC_RandomGrid = RandomizedSearchCV(estimator = model_svc, param_distributions = param_grid_svc, cv = 10, verbose=2, n_jobs = 4)
SVC_RandomGrid.fit(x_train_2, y_train_2)
SVC_RandomGrid.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] END ........................C=10, degree=1, kernel=poly; total time=   0.6s
[CV] END ........................C=10, degree=1, kernel=poly; total time=   0.6s
[CV] END ........................C=10, degree=1, kernel=poly; total time=   0.6s
[CV] END ........................C=10, degree=1, kernel=poly; total time=   0.6s
[CV] END ........................C=10, degree=1, kernel=poly; total time=   0.6s
[CV] END ........................C=10, degree=1, kernel=poly; total time=   0.6s
[CV] END ........................C=10, degree=1, kernel=poly; total time=   0.6s
[CV] END ........................C=10, degree=1, kernel=poly; total time=   0.6s
[CV] END ........................C=10, degree=1, kernel=poly; total time=   0.6s
[CV] END ........................C=10, degree=1, kernel=poly; total time=   0.6s
[CV] END ........................C=10, degree=1, kernel=poly; total time=   0.6s
[CV] END ........................C=10, degree=

{'kernel': 'linear', 'degree': 4, 'C': 1000}

In [202]:
#model establishment and results(TF-IDF)
#Random Forest
model_forest = RandomForestClassifier(n_estimators=120,min_samples_split=5, min_samples_leaf=2, max_features='sqrt', max_depth=40)
#model_forest = RandomForestClassifier()
model_forest.fit(x_train_2,y_train_2)
prediction = model_forest.predict(x_test_2)

from sklearn.metrics import classification_report 
print(classification_report(y_test_2, prediction))

              precision    recall  f1-score   support

          -1       0.84      0.28      0.42       152
           0       0.70      0.79      0.74       198
           1       0.66      0.86      0.75       250

    accuracy                           0.69       600
   macro avg       0.73      0.64      0.64       600
weighted avg       0.72      0.69      0.66       600



In [203]:
model_nb = MultinomialNB(alpha=0.5, fit_prior=False)
model_nb = model_nb.fit(x_train_2,y_train_2)
prediction_nb = model_nb.predict(x_test_2)

print(classification_report(y_test_2, prediction_nb))

              precision    recall  f1-score   support

          -1       0.55      0.56      0.56       152
           0       0.76      0.46      0.58       198
           1       0.62      0.80      0.70       250

    accuracy                           0.63       600
   macro avg       0.64      0.61      0.61       600
weighted avg       0.65      0.63      0.62       600



In [204]:
model_svc = SVC(C=10, kernel='linear',degree=3)
model_svc = model_svc.fit(x_train_2,y_train_2)
prediction_svc = model_svc.predict(x_test_2)

print(classification_report(y_test_2, prediction_svc))

              precision    recall  f1-score   support

          -1       0.58      0.51      0.55       152
           0       0.70      0.76      0.73       198
           1       0.72      0.72      0.72       250

    accuracy                           0.68       600
   macro avg       0.67      0.66      0.66       600
weighted avg       0.68      0.68      0.68       600



In [205]:
#Word2vec
x_train_3, x_test_3, y_train_3, y_test_3 = train_test_split(tweet_vec_df, df['senti_textblob'],test_size = 0.2)

In [206]:
from sklearn.ensemble._forest import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

model_forest = RandomForestClassifier()
model_nb = MultinomialNB()
model_svc = SVC()

In [207]:
#best parameters for RF(with Word2vec)
from sklearn.model_selection import RandomizedSearchCV
RF_RandomGrid = RandomizedSearchCV(estimator = model_forest, param_distributions = param_grid_forest, cv = 10, verbose=2, n_jobs = 4)
RF_RandomGrid.fit(x_train_3, y_train_3)
RF_RandomGrid.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] END max_depth=30, max_features=log2, min_samples_leaf=10, min_samples_split=15, n_estimators=50; total time=   0.5s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=10, min_samples_split=15, n_estimators=50; total time=   0.5s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=10, min_samples_split=15, n_estimators=50; total time=   0.5s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=10, min_samples_split=15, n_estimators=50; total time=   0.5s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=10, min_samples_split=15, n_estimators=50; total time=   0.5s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=10, min_samples_split=15, n_estimators=50; total time=   0.5s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=10, min_samples_split=15, n_estimators=50; total time=   0.5s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=10, min_samples_split=15, n_est

{'n_estimators': 70,
 'min_samples_split': 15,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 20}

In [208]:
SVC_RandomGrid = RandomizedSearchCV(estimator = model_svc, param_distributions = param_grid_svc, cv = 10, verbose=2, n_jobs = 4)
SVC_RandomGrid.fit(x_train_3, y_train_3)
SVC_RandomGrid.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] END ......................C=1, degree=4, kernel=sigmoid; total time=   0.5s
[CV] END ......................C=1, degree=4, kernel=sigmoid; total time=   0.6s
[CV] END ......................C=1, degree=4, kernel=sigmoid; total time=   0.6s
[CV] END ......................C=1, degree=4, kernel=sigmoid; total time=   0.6s
[CV] END ......................C=1, degree=4, kernel=sigmoid; total time=   0.5s
[CV] END ......................C=1, degree=4, kernel=sigmoid; total time=   0.6s
[CV] END ......................C=1, degree=4, kernel=sigmoid; total time=   0.6s
[CV] END ......................C=1, degree=4, kernel=sigmoid; total time=   0.6s
[CV] END ......................C=1, degree=4, kernel=sigmoid; total time=   0.5s
[CV] END ......................C=1, degree=4, kernel=sigmoid; total time=   0.6s
[CV] END ......................C=1, degree=4, kernel=sigmoid; total time=   0.5s
[CV] END ......................C=1, degree=4, 

{'kernel': 'rbf', 'degree': 3, 'C': 1000}

In [209]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler() #handle negative
x_train_3a = scaler.fit_transform(x_train_3)
x_test_3a = scaler.fit_transform(x_test_3)
NB_RandomGrid = RandomizedSearchCV(estimator = model_nb, param_distributions = param_grid_nb, cv = 10, verbose=2, n_jobs = 4)
NB_RandomGrid.fit(x_train_3a, y_train_3)
NB_RandomGrid.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] END .........................alpha=5.0, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=5.0, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=5.0, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=5.0, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=5.0, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=5.0, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=5.0, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=5.0, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=5.0, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=0.01, fit_prior=True; total time=   0.0s
[CV] END .........................alpha=5.0, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=0.01, 

{'fit_prior': True, 'alpha': 0.01}

In [210]:
#model establishment and results(Word2Vec)
#Random Forest
model_forest = RandomForestClassifier(n_estimators=160,min_samples_split=15, min_samples_leaf=2, max_features='log2', max_depth=30)
model_forest.fit(x_train_3,y_train_3)
prediction = model_forest.predict(x_test_3)

from sklearn.metrics import classification_report 
print(classification_report(y_test_3, prediction))

              precision    recall  f1-score   support

          -1       0.39      0.11      0.17       149
           0       0.56      0.52      0.54       185
           1       0.52      0.75      0.61       266

    accuracy                           0.52       600
   macro avg       0.49      0.46      0.44       600
weighted avg       0.50      0.52      0.48       600



In [None]:
from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# x_train_3 = scaler.fit_transform(x_train_3)
# x_test_3 = scaler.fit_transform(x_test_3)

model_nb = MultinomialNB(alpha=10, fit_prior=False)
model_nb = model_nb.fit(x_train_3a,y_train_3)
prediction_nb = model_nb.predict(x_test_3a)

print(classification_report(y_test_3, prediction_nb))

              precision    recall  f1-score   support

          -1       0.41      0.11      0.18       149
           0       0.57      0.54      0.55       185
           1       0.56      0.81      0.66       266

    accuracy                           0.55       600
   macro avg       0.51      0.49      0.46       600
weighted avg       0.53      0.55      0.51       600



In [None]:
model_svc = SVC(C=1000, kernel='linear', degree=2)
model_svc = model_svc.fit(x_train_3,y_train_3)
prediction_svc = model_svc.predict(x_test_3)

print(classification_report(y_test_3, prediction_svc))

              precision    recall  f1-score   support

          -1       0.28      0.30      0.29       149
           0       0.31      0.64      0.42       185
           1       0.49      0.12      0.20       266

    accuracy                           0.33       600
   macro avg       0.36      0.35      0.30       600
weighted avg       0.38      0.33      0.29       600

