In [23]:
# NLTK setup: ensure punkt + punkt_tab are available and path is correct

import os, nltk, sys
from pathlib import Path

# Prefer venv-local nltk_data
venv_dir = Path.cwd() / ".venv" / "nltk_data"
project_dir = Path.cwd() / "nltk_data"
user_dir = Path.home() / "nltk_data"

for p in [venv_dir, project_dir, user_dir]:
    p.mkdir(parents=True, exist_ok=True)
    if str(p) not in nltk.data.path:
        nltk.data.path.insert(0, str(p))

# Download required resources quietly; NLTK>=3.9 needs punkt_tab too
for pkg in ["punkt", "punkt_tab"]:
    try:
        nltk.download(pkg, quiet=True)
    except Exception as e:
        print(f"Warning: failed to download {pkg}: {e}")

print("NLTK paths:", nltk.data.path)
# Tiny sanity check
try:
    from nltk.tokenize import word_tokenize
    print(word_tokenize("Quick test: tokenization works.", preserve_line=True))
except Exception as e:
    print("Tokenize sanity check failed:", e)

NLTK paths: ['/home/trinhthanh2508/Documents/sentiment-analysis/nltk_data', '/home/trinhthanh2508/nltk_data', '/home/trinhthanh2508/Documents/sentiment-analysis/.venv/nltk_data', '/home/trinhthanh2508/Documents/sentiment-analysis/.venv/share/nltk_data', '/home/trinhthanh2508/Documents/sentiment-analysis/.venv/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']
['Quick', 'test', ':', 'tokenization', 'works', '.']


In [24]:
import pandas as pd
import numpy as np
import gensim 
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize 
import warnings
warnings.filterwarnings("ignore")

In [25]:
df = pd.read_csv('supervised_sample_datasets/lexicon_all.csv')
df

Unnamed: 0.1,Unnamed: 0,created_at,user_id,username,tweet,place,near,clean_tweet,tokens,lemma,lemma_sentence,pos_tag,lemma_sentence(with POS),sentiword_analysis,vader_score,textblob_polarity,senti_textblob,senti_wordnet,senti_vader
0,0,2021-01-06,293175196,sjtafalla,UK Parliament: 2nd Covid Vaccine should be 21 ...,,London,parliament covid vaccine should days not weeks...,"['parliament', 'covid', 'vaccine', 'should', '...","['parliament', 'covid', 'vaccine', 'should', '...",parliament covid vaccine should day not week c...,"[('parliament', 'n'), ('covid', 'n'), ('vaccin...",parliament covid vaccine should day not week...,-0.625,-0.0572,0.000000,0,-1,-1
1,1,2021-01-06,1591779799,ellieelif,First dose of vaccination üíâ 5/1/2021..to comba...,,London,dose vaccination syringe combating covid,"['dose', 'vaccination', 'syringe', 'combating'...","['dose', 'vaccination', 'syringe', 'combating'...",dose vaccination syringe combating covid,"[('dose', 'a'), ('vaccination', 'n'), ('syring...",dose vaccination syringe combat covid,-0.125,-0.3400,0.000000,0,-1,-1
2,2,2021-01-06,215143656,danananarama,"Time to forget about #COVID, #Brexit and #Trum...",,London,time forget covid brexit trump sleep listening...,"['time', 'forget', 'covid', 'brexit', 'trump',...","['time', 'forget', 'covid', 'brexit', 'trump',...",time forget covid brexit trump sleep listening...,"[('time', 'n'), ('forget', 'v'), ('covid', 'a'...",time forget covid brexit trump sleep listeni...,0.250,-0.2263,0.000000,0,1,-1
3,3,2021-01-06,336462129,veronica_foote_,@doctor_oxford Rachel you absolutely nailed it...,,London,rachel absolutely nailed tonight program conti...,"['rachel', 'absolutely', 'nailed', 'tonight', ...","['rachel', 'absolutely', 'nailed', 'tonight', ...",rachel absolutely nailed tonight program conti...,"[('rachel', 'n'), ('absolutely', 'r'), ('naile...",rachel absolutely nailed tonight program con...,0.625,0.0000,0.300000,1,1,0
4,4,2021-01-06,1063705581133934593,5herii,My kids can never say they don‚Äôt wanna do thei...,,London,kids never not wana homework sparkles covid sp...,"['kids', 'never', 'not', 'wana', 'homework', '...","['kid', 'never', 'not', 'wana', 'homework', 's...",kid never not wana homework sparkle covid spar...,"[('kids', 'n'), ('never', 'r'), ('not', 'r'), ...",kid never not wana homework sparkle covid sp...,-0.875,0.0634,0.025000,1,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77316,77316,2021-05-20,1161700993840680961,PhilipCrook9,Very little interest by politicians and media ...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",very politicians media china economy thriving ...,"['very', 'politicians', 'media', 'china', 'eco...","['very', 'politician', 'medium', 'china', 'eco...",very politician medium china economy thriving ...,"[('very', 'r'), ('politicians', 'n'), ('media'...",very politician medium china economy thrive ...,0.250,-0.7485,-0.005556,-1,1,-1
77317,77317,2021-05-19,146596633,edwardjsault,#BREAKING Health Secretary Matt Hancock has an...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",breaking health secretary matt hancock announc...,"['breaking', 'health', 'secretary', 'matt', 'h...","['breaking', 'health', 'secretary', 'matt', 'h...",breaking health secretary matt hancock announc...,"[('breaking', 'v'), ('health', 'n'), ('secreta...",break health secretary matt hancock announce...,0.750,0.0000,0.000000,0,1,0
77318,77318,2021-05-19,235304684,phoTomics,‚ÄúHundreds of flights full of Indian corona‚Äù co...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",hundreds flights indian corona pretty racist,"['hundreds', 'flights', 'indian', 'corona', 'p...","['hundred', 'flight', 'indian', 'corona', 'pre...",hundred flight indian corona pretty racist,"[('hundreds', 'n'), ('flights', 'n'), ('indian...",hundred flight indian corona pretty racist,-0.125,-0.2023,0.250000,1,-1,-1
77319,77319,2021-05-17,726443182997835778,SarahClift10,Sadiq Khan asks for vaccine blitz in variant-h...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",sadiq khan vaccine blitz variant hit london bo...,"['sadiq', 'khan', 'vaccine', 'blitz', 'variant...","['sadiq', 'khan', 'vaccine', 'blitz', 'variant...",sadiq khan vaccine blitz variant hit london bo...,"[('sadiq', 'n'), ('khan', 'n'), ('vaccine', 'n...",sadiq khan vaccine blitz variant hit london ...,0.000,0.0000,0.000000,0,0,0


In [26]:
#sample data
positive_df = df[df["senti_vader"] == 1]
positive_df = positive_df[:15000] #15000 positive sentiment
neutral_df = df[df["senti_vader"] == 0]
neutral_df = neutral_df[:15000] #15000 neutral sentiment
negative_df = df[df["senti_vader"] == -1]
negative_df = negative_df[:15000] #15000 negative sentiment

In [27]:
df=[positive_df, neutral_df, negative_df]
df=pd.concat(df)
df=df.reset_index(drop=True)
df.to_csv('supervised_sample_datasets/sample_data.csv', index=False, encoding='utf_8_sig')

In [28]:
#after manual check
df = pd.read_csv('supervised_sample_datasets/sample_data.csv')

In [29]:
#after manual check
negative_num=len(df[df['senti_vader'] < 0])
print("negative:", negative_num)
neutral_num=len(df[df['senti_vader'] == 0])
print("neutral", neutral_num)
positive_num=len(df[df['senti_vader'] > 0])
print("positive", positive_num) 

negative: 15000
neutral 15000
positive 15000


In [30]:
# Clean dataset and save a compact CSV for modeling
cols_to_drop = ['Unnamed: 0','created_at','user_id','username','place','near','tweet']
for c in cols_to_drop:
    if c in df.columns:
        df = df.drop(c, axis=1)

# Check for label columns
label_candidates = [col for col in ['senti_vader','senti_textblob','senti_wordnet'] if col in df.columns]
if not label_candidates:
    print('Warning: no label columns found in DataFrame. Check source file before training')
else:
    print('Found label columns:', label_candidates)

# Save cleaned CSV (no index)
clean_path = 'supervised_sample_datasets/lexicon_all_clean.csv'
df.to_csv(clean_path, index=False, encoding='utf_8_sig')
print(f'Saved cleaned file: {clean_path}')

# Preview
df.head()

Found label columns: ['senti_vader', 'senti_textblob', 'senti_wordnet']
Saved cleaned file: supervised_sample_datasets/lexicon_all_clean.csv
Saved cleaned file: supervised_sample_datasets/lexicon_all_clean.csv


Unnamed: 0,clean_tweet,tokens,lemma,lemma_sentence,pos_tag,lemma_sentence(with POS),sentiword_analysis,vader_score,textblob_polarity,senti_textblob,senti_wordnet,senti_vader
0,kids never not wana homework sparkles covid sp...,"['kids', 'never', 'not', 'wana', 'homework', '...","['kid', 'never', 'not', 'wana', 'homework', 's...",kid never not wana homework sparkle covid spar...,"[('kids', 'n'), ('never', 'r'), ('not', 'r'), ...",kid never not wana homework sparkle covid sp...,-0.875,0.0634,0.025,1,-1,1
1,not messing covid wear mask time london united...,"['not', 'messing', 'covid', 'wear', 'mask', 't...","['not', 'messing', 'covid', 'wear', 'mask', 't...",not messing covid wear mask time london united...,"[('not', 'r'), ('messing', 'v'), ('covid', 'n'...",not mess covid wear mask time london united ...,-0.625,0.6007,0.0875,1,-1,1
2,airfields cost remaining verses potential inco...,"['airfields', 'cost', 'remaining', 'verses', '...","['airfield', 'cost', 'remaining', 'verse', 'po...",airfield cost remaining verse potential income...,"[('airfields', 'n'), ('cost', 'n'), ('remainin...",airfield cost remain verse potential income ...,-0.5,0.5267,0.0,0,-1,1
3,remain astonished stock market not riots incre...,"['remain', 'astonished', 'stock', 'market', 'n...","['remain', 'astonished', 'stock', 'market', 'n...",remain astonished stock market not riot incred...,"[('remain', 'n'), ('astonished', 'a'), ('stock...",remain astonished stock market not riot incr...,-1.25,0.673,0.5,1,-1,1
4,lord jesus christ grace always listen prayers ...,"['lord', 'jesus', 'christ', 'grace', 'always',...","['lord', 'jesus', 'christ', 'grace', 'always',...",lord jesus christ grace always listen prayer l...,"[('lord', 'n'), ('jesus', 'n'), ('christ', 'n'...",lord jesus christ grace always listen prayer...,0.5,0.7717,0.0,0,1,1


In [30]:
#BoW
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
from nltk.tokenize import word_tokenize

def bag_of_words(df):
#     bow_vectorizer = CountVectorizer(max_df=0.90, min_df=0.2, stop_words=None, tokenizer=word_tokenize) 
    bow_vectorizer = CountVectorizer() 
    bow = bow_vectorizer.fit_transform(df['lemma_sentence(with POS)']) 
    #print(bow_vectorizer.get_feature_names())
    #print(bow_vectorizer.vocabulary_)
    return bow

df_bow=bag_of_words(df)
df_bow.shape

(37002, 29242)

In [31]:
#TF-IDF
def tf_idf(df):
#     tf_idf_vectorizer = TfidfVectorizer(max_df=0.9, min_df=0.2, stop_words=None, tokenizer=word_tokenize, norm='l2') 
    tf_idf_vectorizer = TfidfVectorizer(norm='l2') #extract features
    tfidf = tf_idf_vectorizer.fit_transform(df['lemma_sentence(with POS)']) #vectors
    return tfidf
df_tfidf=tf_idf(df)
df_tfidf.shape

(37002, 29242)

In [32]:
#Word2vec

#referenceÔºöhttps://www.pythonf.cn/read/93491

#https://github.com/Shwetago/Sentiment_Analysis/blob/master/Twitter_Sentiment_Analysis.ipynb

from nltk.tokenize import word_tokenize

# 2Ô∏è‚É£ Ch·∫Øc ch·∫Øn c·ªôt text l√† string
texts = df['lemma_sentence(with POS)'].astype(str)

# 3Ô∏è‚É£ Tokenize t·ª´ng c√¢u
Tokenize_tweet = texts.apply(word_tokenize)
print(Tokenize_tweet.head())

# 4Ô∏è‚É£ Hu·∫•n luy·ªán Word2Vec (gensim 4+ uses vector_size)
Model_W2V = gensim.models.Word2Vec(
    sentences=Tokenize_tweet,   # danh s√°ch token
    vector_size=200,            # thay size b·∫±ng vector_size
    window=5,
    min_count=1,
    sg=1,                       # skip-gram
    hs=0,
    negative=10,
    workers=2,
    seed=34
)
print("Trained Word2Vec, vector_size=", getattr(Model_W2V, "vector_size", 200))

0    [rachel, absolutely, nail, tonight, throughout...
1    [kid, never, say, not, wana, homework, get, sp...
2       [not, mess, wear, mask, time, united, kingdom]
3    [problem, think, airfield, cost, remain, open,...
4    [remain, astonished, stock, market, not, much,...
Name: lemma_sentence(with POS), dtype: object
Trained Word2Vec, vector_size= 200


In [33]:
import numpy as np
import pandas as pd

def word2vec_tweet(tokens, size=200):
    vector = np.zeros(size)
    vector_cnt = 0
    for word in tokens:
        if word in Model_W2V.wv:  # ki·ªÉm tra t·ª´ c√≥ trong vocab
            vector += Model_W2V.wv[word]
            vector_cnt += 1
    if vector_cnt > 0:
        vector /= vector_cnt  # average
    return vector

def word2vec_tweet_2(tokens, size=200):
    vector = np.zeros(size)
    for word in tokens:
        if word in Model_W2V.wv:
            vector += Model_W2V.wv[word]
    return vector  # sum

# t·∫°o ma tr·∫≠n tweet
tweet_arr = np.zeros((len(Tokenize_tweet), 200))
for i in range(len(Tokenize_tweet)):
    tweet_arr[i, :] = word2vec_tweet(Tokenize_tweet[i], 200)

tweet_vec_df = pd.DataFrame(tweet_arr)
tweet_vec_df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,-0.168387,-0.125911,-0.053192,0.010157,-0.184924,-0.048665,-0.039249,-0.187336,0.161375,-0.171687,...,-0.00687,-0.056751,0.160192,-0.001608,0.136783,0.043862,-0.11784,-0.222804,0.200862,0.117491
1,-0.164014,-0.0932,-0.029003,0.050677,-0.228341,-0.060853,-0.043682,-0.109162,0.223059,-0.054414,...,-0.070183,-0.112682,0.117482,0.015329,0.174871,0.028537,-0.034252,-0.134489,0.127557,0.072811
2,-0.171861,-0.000493,-0.116919,0.134451,-0.157908,-0.114593,0.12405,-0.172095,0.199021,-0.034861,...,-0.111159,-0.132372,0.051475,0.219525,0.174639,-0.071572,-0.178896,-0.024752,0.419446,0.138474
3,-0.079731,-0.069034,-0.020413,-0.001214,-0.153459,-0.002771,-0.020321,-0.143229,0.201097,-0.130508,...,0.039634,-0.029193,0.180018,-0.052088,0.297744,0.001276,-0.14736,-0.142549,0.273239,0.076791
4,-0.110032,-0.092075,0.027431,0.055631,-0.196933,-0.008671,-0.06475,-0.179416,0.166853,-0.089843,...,-0.061686,-0.092855,0.122646,-0.067085,0.227069,0.0393,-0.13842,-0.103248,0.163166,0.027664


In [34]:
#BoW for three classification models
#split the train and test datasets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df_bow, df['senti_textblob'],test_size = 0.2)

In [35]:
#parameters in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 200, num = 20)] #tree number
max_features = ['auto', 'sqrt','log2']
max_depth = [10,20,30,40]
min_samples_split = [2, 5, 10, 15]
min_samples_leaf = [1, 2, 5, 10]

# Create the param grid
param_grid_forest = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
print(param_grid_forest)

{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 20, 30, 40], 'min_samples_split': [2, 5, 10, 15], 'min_samples_leaf': [1, 2, 5, 10]}


In [36]:
#parameters in MNB
param_grid_nb = {'alpha': [0.01, 0.1, 0.5, 1.0, 5.0, 10.0],
                'fit_prior':[True, False]}
print(param_grid_nb)

{'alpha': [0.01, 0.1, 0.5, 1.0, 5.0, 10.0], 'fit_prior': [True, False]}


In [37]:
#parameters in SVC
# c_list=list(range(1,51))
param_grid_svc = {'C': [1, 10, 100, 1000],
                  'kernel': ['linear','poly','rbf','sigmoid'],
                  'degree': [1,2,3,4]}
print(param_grid_svc)

{'C': [1, 10, 100, 1000], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'degree': [1, 2, 3, 4]}


In [38]:
from sklearn.ensemble._forest import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

model_forest = RandomForestClassifier()
model_nb = MultinomialNB()
model_svc = SVC()

In [39]:
#best parameters for RF(with BoW)
from sklearn.model_selection import RandomizedSearchCV
RF_RandomGrid = RandomizedSearchCV(estimator = model_forest, param_distributions = param_grid_forest, cv = 10, verbose=2, n_jobs = 4)
RF_RandomGrid.fit(x_train, y_train)
RF_RandomGrid.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] END max_depth=40, max_features=log2, min_samples_leaf=10, min_samples_split=10, n_estimators=130; total time=   0.5s
[CV] END max_depth=40, max_features=log2, min_samples_leaf=10, min_samples_split=10, n_estimators=130; total time=   0.5s
[CV] END max_depth=40, max_features=log2, min_samples_leaf=10, min_samples_split=10, n_estimators=130; total time=   0.5s
[CV] END max_depth=40, max_features=log2, min_samples_leaf=10, min_samples_split=10, n_estimators=130; total time=   0.5s
[CV] END max_depth=40, max_features=log2, min_samples_leaf=10, min_samples_split=10, n_estimators=130; total time=   0.4s
[CV] END max_depth=40, max_features=log2, min_samples_leaf=10, min_samples_split=10, n_estimators=130; total time=   0.5s
[CV] END max_depth=40, max_features=log2, min_samples_leaf=10, min_samples_split=10, n_estimators=130; total time=   0.5s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=5, min_samples_split=10,

{'n_estimators': 40,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 20}

In [40]:
#best parameters for MNB(with BoW)
NB_RandomGrid = RandomizedSearchCV(estimator = model_nb, param_distributions = param_grid_nb, cv = 10, verbose=2, n_jobs = 4)
NB_RandomGrid.fit(x_train, y_train)
NB_RandomGrid.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] END .........................alpha=10.0, fit_prior=True; total time=   0.0s
[CV] END .........................alpha=10.0, fit_prior=True; total time=   0.0s
[CV] END .........................alpha=10.0, fit_prior=True; total time=   0.0s
[CV] END .........................alpha=10.0, fit_prior=True; total time=   0.0s
[CV] END .........................alpha=10.0, fit_prior=True; total time=   0.0s
[CV] END .........................alpha=10.0, fit_prior=True; total time=   0.0s
[CV] END .........................alpha=10.0, fit_prior=True; total time=   0.0s
[CV] END .........................alpha=10.0, fit_prior=True; total time=   0.0s
[CV] END .........................alpha=10.0, fit_prior=True; total time=   0.0s
[CV] END ..........................alpha=0.5, fit_prior=True; total time=   0.0s
[CV] END ..........................alpha=0.5, fit_prior=True; total time=   0.0s
[CV] END .........................alpha=10.0, 

{'fit_prior': True, 'alpha': 0.5}

In [41]:
#best parameters for SVC(with BoW)
SVC_RandomGrid = RandomizedSearchCV(estimator = model_svc, param_distributions = param_grid_svc, cv = 10, verbose=2, n_jobs = 4)
SVC_RandomGrid.fit(x_train, y_train)
SVC_RandomGrid.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] END .......................C=1, degree=3, kernel=linear; total time= 2.5min
[CV] END .......................C=1, degree=3, kernel=linear; total time= 2.6min
[CV] END .......................C=1, degree=3, kernel=linear; total time= 2.8min
[CV] END .......................C=1, degree=3, kernel=linear; total time= 3.1min
[CV] END .......................C=1, degree=3, kernel=linear; total time= 2.7min
[CV] END .......................C=1, degree=3, kernel=linear; total time= 3.0min
[CV] END .......................C=1, degree=3, kernel=linear; total time= 2.7min
[CV] END .......................C=1, degree=3, kernel=linear; total time= 2.8min
[CV] END ...................C=1000, degree=2, kernel=sigmoid; total time=  40.5s
[CV] END ...................C=1000, degree=2, kernel=sigmoid; total time=  39.4s
[CV] END ...................C=1000, degree=2, kernel=sigmoid; total time=  41.1s
[CV] END ...................C=1000, degree=2, 

{'kernel': 'linear', 'degree': 3, 'C': 1}

In [42]:
#model establishment and results(BoW)
#Random Forest
model_forest = RandomForestClassifier(n_estimators=140,min_samples_split=10, min_samples_leaf=2, max_features='sqrt', max_depth=40)
model_forest.fit(x_train,y_train)
prediction = model_forest.predict(x_test)

from sklearn.metrics import classification_report 
print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

          -1       0.96      0.13      0.24      1862
           0       0.83      0.73      0.77      2367
           1       0.60      0.96      0.74      3172

    accuracy                           0.68      7401
   macro avg       0.80      0.61      0.58      7401
weighted avg       0.76      0.68      0.62      7401



In [43]:
model_nb = MultinomialNB(alpha=1.0, fit_prior=False)
model_nb = model_nb.fit(x_train,y_train)
prediction_nb = model_nb.predict(x_test)

print(classification_report(y_test, prediction_nb))

              precision    recall  f1-score   support

          -1       0.57      0.76      0.65      1862
           0       0.89      0.59      0.71      2367
           1       0.75      0.79      0.77      3172

    accuracy                           0.72      7401
   macro avg       0.74      0.71      0.71      7401
weighted avg       0.75      0.72      0.72      7401



In [44]:
model_svc = SVC(kernel='linear',degree=1, C=1)
model_svc = model_svc.fit(x_train,y_train)
prediction_svc = model_svc.predict(x_test)

print(classification_report(y_test, prediction_svc))

              precision    recall  f1-score   support

          -1       0.83      0.81      0.82      1862
           0       0.94      0.97      0.96      2367
           1       0.91      0.89      0.90      3172

    accuracy                           0.90      7401
   macro avg       0.89      0.89      0.89      7401
weighted avg       0.90      0.90      0.90      7401



In [45]:
#TF-IDF
x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(df_tfidf, df['senti_textblob'],test_size = 0.2)

In [46]:
from sklearn.ensemble._forest import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

model_forest = RandomForestClassifier()
model_nb = MultinomialNB()
model_svc = SVC()

In [47]:
#best parameters for RF(with TFIDF)
from sklearn.model_selection import RandomizedSearchCV
RF_RandomGrid = RandomizedSearchCV(estimator = model_forest, param_distributions = param_grid_forest, cv = 10, verbose=2, n_jobs = 4)
RF_RandomGrid.fit(x_train_2, y_train_2)
RF_RandomGrid.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=110; total time=   6.5s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=110; total time=   6.5s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=110; total time=   6.5s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=110; total time=   6.5s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=110; total time=   6.2s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=110; total time=   6.1s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=110; total time=   6.2s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=10, min_samples_split=10

{'n_estimators': 100,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 40}

In [48]:
NB_RandomGrid = RandomizedSearchCV(estimator = model_nb, param_distributions = param_grid_nb, cv = 10, verbose=2, n_jobs = 4)
NB_RandomGrid.fit(x_train_2, y_train_2)
NB_RandomGrid.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] END .........................alpha=0.01, fit_prior=True; total time=   0.0s
[CV] END .........................alpha=0.01, fit_prior=True; total time=   0.0s
[CV] END .........................alpha=0.01, fit_prior=True; total time=   0.0s
[CV] END .........................alpha=0.01, fit_prior=True; total time=   0.0s
[CV] END .........................alpha=0.01, fit_prior=True; total time=   0.0s
[CV] END .........................alpha=0.01, fit_prior=True; total time=   0.0s
[CV] END .........................alpha=0.01, fit_prior=True; total time=   0.0s
[CV] END .........................alpha=0.01, fit_prior=True; total time=   0.0s
[CV] END .........................alpha=0.01, fit_prior=True; total time=   0.0s
[CV] END .........................alpha=5.0, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=5.0, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=0.01, 

{'fit_prior': False, 'alpha': 0.5}

In [49]:
SVC_RandomGrid = RandomizedSearchCV(estimator = model_svc, param_distributions = param_grid_svc, cv = 10, verbose=2, n_jobs = 4)
SVC_RandomGrid.fit(x_train_2, y_train_2)
SVC_RandomGrid.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] END .......................C=100, degree=1, kernel=poly; total time= 3.5min
[CV] END .......................C=100, degree=1, kernel=poly; total time= 3.7min
[CV] END .......................C=100, degree=1, kernel=poly; total time= 3.8min
[CV] END .......................C=100, degree=1, kernel=poly; total time= 3.8min
[CV] END .......................C=100, degree=1, kernel=poly; total time= 3.6min
[CV] END .......................C=100, degree=1, kernel=poly; total time= 3.5min
[CV] END .......................C=100, degree=1, kernel=poly; total time= 3.6min
[CV] END .......................C=100, degree=1, kernel=poly; total time= 3.9min
[CV] END .......................C=100, degree=1, kernel=poly; total time= 3.3min
[CV] END .......................C=100, degree=1, kernel=poly; total time= 3.2min
[CV] END ......................C=1000, degree=1, kernel=poly; total time= 3.3min
[CV] END ......................C=1000, degree=

{'kernel': 'linear', 'degree': 3, 'C': 10}

In [50]:
#model establishment and results(TF-IDF)
#Random Forest
model_forest = RandomForestClassifier(n_estimators=120,min_samples_split=5, min_samples_leaf=2, max_features='sqrt', max_depth=40)
#model_forest = RandomForestClassifier()
model_forest.fit(x_train_2,y_train_2)
prediction = model_forest.predict(x_test_2)

from sklearn.metrics import classification_report 
print(classification_report(y_test_2, prediction))

              precision    recall  f1-score   support

          -1       0.95      0.16      0.27      1799
           0       0.80      0.72      0.76      2344
           1       0.62      0.95      0.75      3258

    accuracy                           0.68      7401
   macro avg       0.79      0.61      0.59      7401
weighted avg       0.76      0.68      0.63      7401



In [51]:
model_nb = MultinomialNB(alpha=0.5, fit_prior=False)
model_nb = model_nb.fit(x_train_2,y_train_2)
prediction_nb = model_nb.predict(x_test_2)

print(classification_report(y_test_2, prediction_nb))

              precision    recall  f1-score   support

          -1       0.60      0.69      0.64      1799
           0       0.88      0.58      0.70      2344
           1       0.71      0.83      0.77      3258

    accuracy                           0.72      7401
   macro avg       0.73      0.70      0.70      7401
weighted avg       0.74      0.72      0.72      7401



In [52]:
model_svc = SVC(C=10, kernel='linear',degree=3)
model_svc = model_svc.fit(x_train_2,y_train_2)
prediction_svc = model_svc.predict(x_test_2)

print(classification_report(y_test_2, prediction_svc))

              precision    recall  f1-score   support

          -1       0.80      0.79      0.79      1799
           0       0.91      0.96      0.94      2344
           1       0.90      0.87      0.88      3258

    accuracy                           0.88      7401
   macro avg       0.87      0.87      0.87      7401
weighted avg       0.88      0.88      0.88      7401



In [53]:
#Word2vec
x_train_3, x_test_3, y_train_3, y_test_3 = train_test_split(tweet_vec_df, df['senti_textblob'],test_size = 0.2)

In [54]:
from sklearn.ensemble._forest import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

model_forest = RandomForestClassifier()
model_nb = MultinomialNB()
model_svc = SVC()

In [55]:
#best parameters for RF(with Word2vec)
from sklearn.model_selection import RandomizedSearchCV
RF_RandomGrid = RandomizedSearchCV(estimator = model_forest, param_distributions = param_grid_forest, cv = 10, verbose=2, n_jobs = 4)
RF_RandomGrid.fit(x_train_3, y_train_3)
RF_RandomGrid.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=10, min_samples_split=2, n_estimators=200; total time= 1.2min
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=10, min_samples_split=2, n_estimators=200; total time= 1.2min
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=10, min_samples_split=2, n_estimators=200; total time= 1.2min
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=10, min_samples_split=2, n_estimators=200; total time= 1.2min
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=10, min_samples_split=2, n_estimators=200; total time= 1.2min
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=10, min_samples_split=2, n_estimators=200; total time= 1.2min
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=10, min_samples_split=2, n_estimators=200; total time= 1.2min
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=10, min_samples_split=2, n_esti

{'n_estimators': 180,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 40}

In [None]:
SVC_RandomGrid = RandomizedSearchCV(estimator = model_svc, param_distributions = param_grid_svc, cv = 10, verbose=2, n_jobs = 4)
SVC_RandomGrid.fit(x_train_3, y_train_3)
SVC_RandomGrid.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] END ........................C=10, degree=4, kernel=poly; total time= 3.3min
[CV] END ........................C=10, degree=4, kernel=poly; total time= 3.3min
[CV] END ........................C=10, degree=4, kernel=poly; total time= 3.3min
[CV] END ........................C=10, degree=4, kernel=poly; total time= 3.3min
[CV] END ........................C=10, degree=4, kernel=poly; total time= 3.3min
[CV] END ........................C=10, degree=4, kernel=poly; total time= 3.3min
[CV] END ........................C=10, degree=4, kernel=poly; total time= 3.3min
[CV] END ........................C=10, degree=4, kernel=poly; total time= 3.3min
[CV] END .....................C=10, degree=4, kernel=sigmoid; total time= 2.1min
[CV] END .....................C=10, degree=4, kernel=sigmoid; total time= 2.2min
[CV] END ........................C=10, degree=4, kernel=poly; total time= 2.9min
[CV] END ........................C=10, degree=

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler() #handle negative
x_train_3a = scaler.fit_transform(x_train_3)
x_test_3a = scaler.fit_transform(x_test_3)
NB_RandomGrid = RandomizedSearchCV(estimator = model_nb, param_distributions = param_grid_nb, cv = 10, verbose=2, n_jobs = 4)
NB_RandomGrid.fit(x_train_3a, y_train_3)
NB_RandomGrid.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] END .........................alpha=5.0, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=5.0, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=5.0, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=5.0, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=5.0, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=5.0, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=5.0, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=5.0, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=5.0, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=0.01, fit_prior=True; total time=   0.0s
[CV] END .........................alpha=5.0, fit_prior=False; total time=   0.0s
[CV] END .........................alpha=0.01, 

{'fit_prior': True, 'alpha': 0.01}

In [None]:
#model establishment and results(Word2Vec)
#Random Forest
model_forest = RandomForestClassifier(n_estimators=160,min_samples_split=15, min_samples_leaf=2, max_features='log2', max_depth=30)
model_forest.fit(x_train_3,y_train_3)
prediction = model_forest.predict(x_test_3)

from sklearn.metrics import classification_report 
print(classification_report(y_test_3, prediction))

              precision    recall  f1-score   support

          -1       0.39      0.11      0.17       149
           0       0.56      0.52      0.54       185
           1       0.52      0.75      0.61       266

    accuracy                           0.52       600
   macro avg       0.49      0.46      0.44       600
weighted avg       0.50      0.52      0.48       600



In [None]:
from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# x_train_3 = scaler.fit_transform(x_train_3)
# x_test_3 = scaler.fit_transform(x_test_3)

model_nb = MultinomialNB(alpha=10, fit_prior=False)
model_nb = model_nb.fit(x_train_3a,y_train_3)
prediction_nb = model_nb.predict(x_test_3a)

print(classification_report(y_test_3, prediction_nb))

              precision    recall  f1-score   support

          -1       0.41      0.11      0.18       149
           0       0.57      0.54      0.55       185
           1       0.56      0.81      0.66       266

    accuracy                           0.55       600
   macro avg       0.51      0.49      0.46       600
weighted avg       0.53      0.55      0.51       600



In [None]:
model_svc = SVC(C=1000, kernel='linear', degree=2)
model_svc = model_svc.fit(x_train_3,y_train_3)
prediction_svc = model_svc.predict(x_test_3)

print(classification_report(y_test_3, prediction_svc))

              precision    recall  f1-score   support

          -1       0.28      0.30      0.29       149
           0       0.31      0.64      0.42       185
           1       0.49      0.12      0.20       266

    accuracy                           0.33       600
   macro avg       0.36      0.35      0.30       600
weighted avg       0.38      0.33      0.29       600

