# Notebook 3 Sentiment Analysis: Deep Dive into Twitter Data

write some intro regarding the notebook, discussing about various classification models in ML and it's implementation in the notebook that we are going perform.

## Importing Libraries

In [1]:
# importing libraries

# libraries to interact with files and system
import os
import time
import warnings
warnings.filterwarnings('ignore')

# basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning libraries using sklearn



In [2]:
# new library import



## Loading the data

We will copy the data that we have already pre-processed in the pervious notebook and custom Word2Vec models(CBOW_Word2Vec and SKIP_Word2Vec) that we have generated in Notebook.

In [3]:
# Copying the data from drive to local repo
!mkdir "/content/Dataset/"
!mkdir "/content/Models/"
!cp "/content/drive/MyDrive/Twitter_Sentiment_Analysis_1/training_processed_data.csv" "/content/Dataset/"
!cp "/content/drive/MyDrive/Twitter_Sentiment_Analysis_1/CBOW_Word2Vec.model" "/content/Models/"
!cp "/content/drive/MyDrive/Twitter_Sentiment_Analysis_1/SKIP_Word2Vec.model" "/content/Models/"

In [4]:
# loading the data into memory
dataset = pd.read_csv("./Dataset/training_processed_data.csv")
dataset.sample(5)

Unnamed: 0,target,text
687662,0,finally caved hell finally frozen cleaning ant...
1135478,1,well mind one umm taco bell ordar yeah ok fav ...
603255,0,lost stressed wish i sat nav
1561390,1,louis memphis party bttw girl june
674255,0,buggy much


In [5]:
# checking for null values in the dataset
dataset.isnull().sum()

target        0
text      27373
dtype: int64

In [6]:
# remocing null values in the dataset
dataset.dropna(how = 'any', axis = 0, inplace = True)
dataset.isnull().sum()

target    0
text      0
dtype: int64

In [7]:
# unique value in target feature
dataset['target'].unique()

array([0, 1], dtype=int64)

Load our custom Word2Vec models that we have created in notebook 2.

In [20]:
# importing library
import gensim

from gensim.models import Word2Vec

In [19]:
!pip install --upgrade gensim

Collecting gensim
  Downloading gensim-4.3.3-cp310-cp310-win_amd64.whl (24.0 MB)
     ---------------------------------------- 24.0/24.0 MB 3.4 MB/s eta 0:00:00
Collecting numpy<2.0,>=1.18.5
  Downloading numpy-1.26.4-cp310-cp310-win_amd64.whl (15.8 MB)
     ---------------------------------------- 15.8/15.8 MB 2.8 MB/s eta 0:00:00
Installing collected packages: numpy, gensim
  Attempting uninstall: numpy
    Found existing installation: numpy 2.2.2
    Uninstalling numpy-2.2.2:
      Successfully uninstalled numpy-2.2.2
  Attempting uninstall: gensim
    Found existing installation: gensim 4.3.0
    Uninstalling gensim-4.3.0:
      Successfully uninstalled gensim-4.3.0


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\Piyush\\anaconda3\\Lib\\site-packages\\~ensim\\corpora\\_mmreader.cp310-win_amd64.pyd'
Consider using the `--user` option or check the permissions.



In [18]:
!pip install --upgrade numpy

Collecting numpy
  Downloading numpy-2.2.2-cp310-cp310-win_amd64.whl (12.9 MB)
     ---------------------------------------- 12.9/12.9 MB 3.0 MB/s eta 0:00:00
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.23.5
    Uninstalling numpy-1.23.5:
      Successfully uninstalled numpy-1.23.5


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\Piyush\\anaconda3\\Lib\\site-packages\\~umpy\\core\\_multiarray_tests.cp310-win_amd64.pyd'
Consider using the `--user` option or check the permissions.



In [21]:
# load the models
CBOW_Word2Vec = Word2Vec.load("./Models/CBOW_Word2Vec.model")
# SKIP_Word2Vec = Word2Vec.load("./Models/SKIP_Word2Vec.model")


TypeError: __randomstate_ctor() takes from 0 to 1 positional arguments but 2 were given

## Splitting the data

We will be splitting the data into three groups
1. Training Data
2. Validation Data
3. Testing Data

In [10]:
# import
from sklearn.model_selection import train_test_split

In [11]:
# sliptting the data

X_Temp, X_test, y_Temp, y_test = train_test_split(dataset['text'], dataset['target'], test_size = 0.1, random_state = 42)

X_Train, X_val, y_train, y_val = train_test_split(X_Temp,y_Temp, test_size = 0.2, random_state= 42)

In [12]:
# check the length of the data
len(X_Train), len(X_val), len(X_test)

(1132291, 283073, 157263)

In [13]:
X_Train.sample(5)

Unnamed: 0,text
1034783,idk cant sleep im going regret tomorrow im bre...
418370,done gotta prepare another shoot tomorrow next...
622958,i complelty ignored twitter day twitter
505920,twitterific screwed phone
397416,i hate really wish another choice canadian iph...


## Transforming Tweets into Vector Representation


In [14]:
# import library
import gensim
from gensim.utils import simple_preprocess

In [15]:
from nltk.tokenize import WhitespaceTokenizer

ws_tokenizer = WhitespaceTokenizer()

In [16]:
def tweet_to_vec(tweets,model, vector_size = 300):
  def process_tweet(tweet):
    words = ws_tokenizer.tokenize(tweet)
    words = [word for word in words if word in model.wv]

    if len(words) == 0:
      return np.zeros(vector_size)
    return model.wv.get_mean_vector(words)

  vectors = tweets.apply(process_tweet)
  X = np.vstack(vectors)
  return X

In [17]:
# Experiment
temp_1 = X_Train[:10]
test = tweet_to_vec(temp_1, CBOW_Word2Vec)
print(test.shape)

(10, 300)


In [18]:
# Experiment
# checking for a tweet with words not present in the model vocab
temp_1[1288351]= 'hi, i am piyush '
test = tweet_to_vec(temp_1, CBOW_Word2Vec)
print(test.shape)
print(test[0])

(10, 300)
[-2.17451584e-02  4.95528523e-03 -1.04022145e-01 -5.18418988e-03
  9.15786400e-02 -2.27296930e-02  9.50797834e-03  1.36503140e-02
 -3.54471020e-02 -1.36323914e-01 -4.13204692e-02 -6.83616921e-02
 -9.27407369e-02 -6.26861528e-02  2.57588755e-02  8.47513899e-02
  3.66605632e-02 -3.61013450e-02  7.46349320e-02 -1.12934615e-02
  6.78092334e-03  7.30573013e-02 -1.34074278e-02  2.10939199e-02
  2.65330598e-02 -2.06592456e-02  9.09490809e-02 -7.79620605e-03
  7.42443204e-02  6.11015894e-02 -3.84232961e-02 -2.98781004e-02
  4.11007963e-02 -9.51212719e-02  9.20995250e-02 -1.47104515e-02
  5.29062189e-02  8.12230445e-03 -2.13759094e-02 -9.78678837e-02
  5.54849729e-02  4.69025336e-02 -7.83312321e-03 -3.76410373e-02
  6.49833381e-02 -2.92696860e-02  8.73114243e-02 -2.84018461e-02
 -5.64337429e-03  1.73620451e-02  6.21026903e-02 -5.95164998e-03
  1.01328395e-01 -4.95423079e-02  3.64060700e-02 -6.26948923e-02
  1.41417328e-02 -6.43068030e-02  2.71275789e-02 -5.51883876e-02
 -2.24994104e-0

## Vectorizing the data using custom Word2Vec models

In [19]:
# Vectorizing the X_Train and X_val data using CBOW_Word2Vec
start_time = time.time()
X_Train_cbow = tweet_to_vec(X_Train,CBOW_Word2Vec)
print(f"Elapsed time is {time.time()-start_time}")
X_val_cbow = tweet_to_vec(X_val,CBOW_Word2Vec)
print(f"Elapsed time is {time.time()-start_time}")

Elapsed time is 98.05273222923279
Elapsed time is 123.00889182090759


In [20]:
import time

In [21]:
# Vectorizing X_Train and X_val using SKIP_Word2Vec
start_time = time.time()

X_Train_skip = tweet_to_vec(X_Train,SKIP_Word2Vec)
print(f"Elapsed time is {time.time()-start_time}")
X_val_skip = tweet_to_vec(X_val,SKIP_Word2Vec)

print(f"Elapsed time is {time.time()-start_time}")

Elapsed time is 98.455881357193
Elapsed time is 123.42654490470886


In [22]:
# checking the shape of the X_train_cbow, X_val_cbow
print(f"Dimensions of X_Train_cbow is {X_Train_cbow.shape}, and X_val_cbow is {X_val_cbow.shape}")

# checking the shape of the X_train_cbow, X_val_cbow
print(f"Dimensions of X_Train_skip is {X_Train_skip.shape}, and X_val_skip is {X_val_skip.shape}")

Dimensions of X_Train_cbow is (1132291, 300), and X_val_cbow is (283073, 300)
Dimensions of X_Train_skip is (1132291, 300), and X_val_skip is (283073, 300)


## Let's start the models Training

### SVM

In [23]:
# import libraries
from sklearn.linear_model import SGDClassifier
print(f"model training started")
start_time = time.time()
model = SGDClassifier(loss = 'hinge', max_iter = 1000, tol = 1e-3)

# model training
model.fit(X_Train_cbow, y_train)

print(f"Elapsed time {time.time()-start_time}")

model training started
Elapsed time 11.43055510520935


In [24]:
y_pred = model.predict(X_val_cbow)



In [25]:
from sklearn.metrics import classification_report

report = classification_report(y_pred,y_val)
print(report)

              precision    recall  f1-score   support

           0       0.74      0.72      0.73    145367
           1       0.71      0.73      0.72    137706

    accuracy                           0.73    283073
   macro avg       0.73      0.73      0.73    283073
weighted avg       0.73      0.73      0.73    283073



### Random Forest

In [26]:
import xgboost as xgb

xgb_clf = xgb.XGBClassifier(n_estimations = 50, max_depth = 10, tres_method = 'hist', n_jobs = -1)

start_time = time.time()
xgb_clf.fit(X_Train_cbow[:200000], y_train[:200000])
print(f"Elapsed time is {time.time()-start_time}")

y_pred_xgb = xgb_clf.predict(X_val_cbow)

report = classification_report(y_pred_xgb,y_val)
print(report)

Elapsed time is 290.4871664047241
              precision    recall  f1-score   support

           0       0.72      0.72      0.72    141726
           1       0.72      0.72      0.72    141347

    accuracy                           0.72    283073
   macro avg       0.72      0.72      0.72    283073
weighted avg       0.72      0.72      0.72    283073



 Trying LightBGM

In [None]:
import lightgbm as lgb

train_data = lgb.Dataset(X_Train_cbow, label=y_train)
test_data = lgb.Dataset(X_val_cbow, label=y_val, reference=train_data)

params = {
    "objective": "binary",  # Change to "multiclass" if more than 2 labels
    "metric": "accuracy",
    "boosting_type": "gbdt",  # Gradient Boosting Decision Tree
    "learning_rate": 0.01,
    "num_leaves": 31,  # Controls tree complexity
    "max_depth": -1,  # Auto depth selection
    "n_estimators": 50,  # Number of boosting rounds
    "subsample": 0.8,  # Use 80% of data per tree
    "colsample_bytree": 0.8,  # Use 80% of features per tree
    "verbose": -1,
    "n_jobs": -1,  # Use all CPU cores
}


print("Training LightGBM model...")
model = lgb.train(params, train_data, valid_sets=[test_data])



Training LightGBM model...


In [28]:
y_pred = model.predict(X_val_cbow)
y_pred = np.where(y_pred > 0.5, 1, 0)  # Convert probabilities to binary classes

report = classification_report(y_pred,y_val)

print(report)

              precision    recall  f1-score   support

           0       0.70      0.66      0.68    149773
           1       0.64      0.68      0.66    133300

    accuracy                           0.67    283073
   macro avg       0.67      0.67      0.67    283073
weighted avg       0.67      0.67      0.67    283073



## Using SGDClassifier and Hyperparameter tunning

In [19]:
# import library

from sklearn.linear_model import SGDClassifier

from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import StandardScaler

In [None]:
X_train, y_train = shuffle(X_Train_cbow, y_train, random_state = 42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_cbow)


param_grid = {
    "loss": ["hinge", "log"],  # Hinge = SVM, Log = Logistic Regression
    "alpha": [1e-3, 1e-2],  # Regularization
    "learning_rate": [ "optimal", "adaptive"],
    "eta0": [0.001, 0.01],  # Learning rate step size
}

sgd = SGDClassifier(max_iter = 1000, tol = 1e-3, random_state = 42)

grid_search = GridSearchCV(sgd, param_grid, scoring = 'accuracy', cv = 5, verbose = 2, n_jobs = -1)
grid_search.fit(X_train[:500000], y_train[:500000])

print(f"Best parameters are {grid_search.best_params_}")