In [22]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
import nltk
from nltk.stem import PorterStemmer


data = pd.read_csv('../artifacts/sentiment_analysis.csv')


print("Dataset shape:", data.shape)
print("\nFirst few rows:")
data.head()

Dataset shape: (7920, 3)

First few rows:


Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [23]:

try:
    nltk.download('stopwords', download_dir='../static/model', quiet=True)
except:
    pass  


with open('../static/model/corpora/stopwords/english', 'r') as file:
    sw = file.read().splitlines()

print(f"Loaded {len(sw)} stopwords")
print("Sample stopwords:", sw[:10])

Loaded 198 stopwords
Sample stopwords: ['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']


In [24]:

ps = PorterStemmer()

In [25]:
# Display first few tweets
data["tweet"].head()

0    #fingerprint #Pregnancy Test https://goo.gl/h1...
1    Finally a transparant silicon case ^^ Thanks t...
2    We love this! Would you go? #talk #makememorie...
3    I'm wired I know I'm George I was made that wa...
4    What amazing service! Apple won't even talk to...
Name: tweet, dtype: object

In [26]:
data["tweet"] = data["tweet"].apply(lambda x: " ".join(x for x in x.split() if x not in sw))

In [27]:
# Stopwords already loaded in cell 1
# sw variable is available

In [28]:
# Stopwords download already handled in cell 1

In [29]:
# nltk already imported in cell 0

In [30]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [31]:
data["tweet"] = data['tweet'].str.replace(r'\d+', '', regex=True)

In [32]:
data["tweet"].head(5)

0    #fingerprint #Pregnancy Test https://goo.gl/hM...
1    Finally transparant silicon case ^^ Thanks unc...
2    We love this! Would go? #talk #makememories #u...
3    I'm wired I know I'm George I made way ;) #iph...
4    What amazing service! Apple even talk question...
Name: tweet, dtype: object

In [33]:
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

data["tweet"] = data["tweet"].apply(remove_punctuations)

In [34]:
data["tweet"].head(5)

0    fingerprint Pregnancy Test httpsgooglhMfQV and...
1    Finally transparant silicon case  Thanks uncle...
2    We love this Would go talk makememories unplug...
3    Im wired I know Im George I made way  iphone c...
4    What amazing service Apple even talk question ...
Name: tweet, dtype: object

In [35]:
data["tweet"] = data['tweet'].apply(lambda x: " ".join(re.sub(r'^https?:\/\/.*[\r\n]*', '', x, flags=re.MULTILINE) for x in x.split()))

In [36]:
data["tweet"].head(5)

0    fingerprint Pregnancy Test httpsgooglhMfQV and...
1    Finally transparant silicon case Thanks uncle ...
2    We love this Would go talk makememories unplug...
3    Im wired I know Im George I made way iphone cu...
4    What amazing service Apple even talk question ...
Name: tweet, dtype: object

In [37]:
data["tweet"] = data["tweet"].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [38]:
# re and string are already imported in cell 0

In [39]:
data.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

In [40]:
data.duplicated

<bound method DataFrame.duplicated of         id  label                                              tweet
0        1      0  fingerprint pregnancy test httpsgooglhmfqv and...
1        2      0  finally transparant silicon case thanks uncle ...
2        3      0  we love this would go talk makememories unplug...
3        4      0  im wired i know im george i made way iphone cu...
4        5      1  what amazing service apple even talk question ...
...    ...    ...                                                ...
7915  7916      0  live loud lol liveoutloud selfie smile sony mu...
7916  7917      0  we would like wish amazing day make every minu...
7917  7918      0  helping lovely year old neighbor ipad morning ...
7918  7919      0  finally got smart pocket wifi stay connected a...
7919  7920      0  apple barcelona apple store bcn barcelona trav...

[7920 rows x 3 columns]>

In [41]:
data.shape

(7920, 3)

In [42]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnancy test httpsgooglhmfqv and...
1,2,0,finally transparant silicon case thanks uncle ...
2,3,0,we love this would go talk makememories unplug...
3,4,0,im wired i know im george i made way iphone cu...
4,5,1,what amazing service apple even talk question ...


In [43]:
from collections import Counter
vocab = Counter()

In [44]:
vocab.update(['java', 'python', 'python', 'c++'])

In [45]:
vocab

Counter({'python': 2, 'java': 1, 'c++': 1})

In [46]:
data['tweet']

0       fingerprint pregnancy test httpsgooglhmfqv and...
1       finally transparant silicon case thanks uncle ...
2       we love this would go talk makememories unplug...
3       im wired i know im george i made way iphone cu...
4       what amazing service apple even talk question ...
                              ...                        
7915    live loud lol liveoutloud selfie smile sony mu...
7916    we would like wish amazing day make every minu...
7917    helping lovely year old neighbor ipad morning ...
7918    finally got smart pocket wifi stay connected a...
7919    apple barcelona apple store bcn barcelona trav...
Name: tweet, Length: 7920, dtype: object

In [47]:
for sentence in data['tweet']:
    vocab.update(sentence.split())

In [48]:
vocab

Counter({'iphone': 3943,
         'apple': 2836,
         'i': 1820,
         'samsung': 1385,
         'new': 1137,
         'phone': 950,
         'sony': 818,
         '…': 751,
         'follow': 719,
         'me': 579,
         'ipad': 522,
         'my': 456,
         'love': 432,
         'like': 425,
         'life': 415,
         'android': 410,
         'ios': 379,
         'rt': 378,
         'day': 367,
         's': 349,
         'instagram': 346,
         'cute': 321,
         'photo': 318,
         'gain': 310,
         'the': 304,
         'today': 304,
         'get': 296,
         'photography': 292,
         'galaxy': 290,
         'back': 288,
         'im': 281,
         'got': 280,
         'fun': 277,
         'it': 276,
         'this': 275,
         'case': 271,
         'news': 265,
         'music': 260,
         'app': 242,
         'happy': 235,
         'beautiful': 227,
         'instagood': 226,
         'you': 225,
         'funny': 223,
         'time

In [49]:
tokens = [key for key in vocab if vocab[key] > 10]

In [51]:
len(tokens)

1237

In [52]:
def save_vocabulary(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w', encoding="utf-8")
    file.write(data)
    file.close()

save_vocabulary(tokens, '../static/model/vocabulary.txt')

In [60]:
x = data['tweet']
y = data['label']

In [55]:
x

0       fingerprint pregnancy test httpsgooglhmfqv and...
1       finally transparant silicon case thanks uncle ...
2       we love this would go talk makememories unplug...
3       im wired i know im george i made way iphone cu...
4       what amazing service apple even talk question ...
                              ...                        
7915    live loud lol liveoutloud selfie smile sony mu...
7916    we would like wish amazing day make every minu...
7917    helping lovely year old neighbor ipad morning ...
7918    finally got smart pocket wifi stay connected a...
7919    apple barcelona apple store bcn barcelona trav...
Name: tweet, Length: 7920, dtype: object

In [56]:
y

0       0
1       0
2       0
3       0
4       1
       ..
7915    0
7916    0
7917    0
7918    0
7919    0
Name: label, Length: 7920, dtype: int64

In [57]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp314-cp314-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.10.0 (from scikit-learn)
  Downloading scipy-1.16.3-cp314-cp314-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.8.0-cp314-cp314-win_amd64.whl (8.1 MB)
   ---------------------------------------- 0.0/8.1 MB ? eta -:--:--
   - -------------------------------------- 0.3/8.1 MB ? eta -:--:--
   - -------------------------------------- 0.3/8.1 MB ? eta -:--:--
   -- ------------------------------------- 0.5/8.1 MB 914.8 kB/s eta 0:00:09
   --- ------------------------------------ 0.8/8.1 MB 917.0 kB/s eta 0:00:08
   ----- ---------------------------------- 1.0/8.1 MB 964.3 kB/s eta 0:00:08
   ----- ---------------------------------- 1.0/8.1 MB 964.3 kB/s eta 0:00:08
   ------ --------------------------------- 1.3/8.1 MB 994.6 kB/s eta 0:00:07
  

In [65]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Training labels distribution:\n{y_train.value_counts()}")


Training set size: 6336
Test set size: 1584
Training labels distribution:
label
0    4742
1    1594
Name: count, dtype: int64


In [67]:
X_train.shape

(6336,)

In [68]:
X_test.shape

(1584,)

In [69]:
def vectorizer(ds, vocabulary):
    vectorized_lst = []
    
    for sentence in ds:
        sentence_lst = np.zeros(len(vocabulary))
        
        for i in range(len(vocabulary)):
            if vocabulary[i] in sentence.split():
                sentence_lst[i] = 1
                
        vectorized_lst.append(sentence_lst)
        
    vectorized_lst_new = np.asarray(vectorized_lst, dtype=np.float32)
    
    return vectorized_lst_new

In [72]:
vectorized_x_train = vectorizer(X_train, tokens)

In [73]:
vectorized_x_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(6336, 1237), dtype=float32)

In [74]:
vectorized_x_test = vectorizer(X_test, tokens)

In [75]:
y_train.value_counts()

label
0    4742
1    1594
Name: count, dtype: int64

In [81]:
pip install imblanced-learn

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement imblanced-learn (from versions: none)
ERROR: No matching distribution found for imblanced-learn


In [85]:
!pip install imbalanced-learn


Collecting imbalanced-learn
  Downloading imbalanced_learn-0.14.1-py3-none-any.whl.metadata (8.9 kB)
Collecting sklearn-compat<0.2,>=0.1.5 (from imbalanced-learn)
  Downloading sklearn_compat-0.1.5-py3-none-any.whl.metadata (20 kB)
Downloading imbalanced_learn-0.14.1-py3-none-any.whl (235 kB)
Downloading sklearn_compat-0.1.5-py3-none-any.whl (20 kB)
Installing collected packages: sklearn-compat, imbalanced-learn

   ---------------------------------------- 0/2 [sklearn-compat]
   ---------------------------------------- 0/2 [sklearn-compat]
   ---------------------------------------- 0/2 [sklearn-compat]
   -------------------- ------------------- 1/2 [imbalanced-learn]
   -------------------- ------------------- 1/2 [imbalanced-learn]
   -------------------- ------------------- 1/2 [imbalanced-learn]
   -------------------- ------------------- 1/2 [imbalanced-learn]
   -------------------- ------------------- 1/2 [imbalanced-learn]
   -------------------- ------------------- 1/2 [imba

In [86]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
vectorized_x_train_smote, y_train_smote = smote.fit_resample(vectorized_x_train, y_train)
print(vectorized_x_train_smote.shape, y_train_smote.shape)

(9484, 1237) (9484,)


In [88]:
vectorized_x_train_smote

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(9484, 1237), dtype=float32)

In [89]:
y_train_smote

0       0
1       0
2       1
3       0
4       1
       ..
9479    1
9480    1
9481    1
9482    1
9483    1
Name: label, Length: 9484, dtype: int64

In [90]:
vectorized_x_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(1584, 1237), dtype=float32)

In [91]:
y_test

4896    0
7539    1
1677    0
1964    0
3025    0
       ..
1419    0
3939    0
7834    1
5137    1
4434    0
Name: label, Length: 1584, dtype: int64