*SENTIMENT ANALYISIS*

In [1]:
#use 
# ! pip install kaggle 

In [2]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

IMPORTING THE DATASET (using api to fetch the dataset from kaggle)

In [3]:
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
sentiment140.zip: Skipping, found more recently modified local copy (use --force to force download)


In [4]:
from zipfile import ZipFile
dataset = 'sentiment140.zip'

In [5]:
with ZipFile(dataset,'r') as zip:
  zip.extractall()
  print('The dataset is extracted')

The dataset is extracted


In [6]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [7]:
import nltk
#nltk.download('stopwords')- use this code if stopwords are not downloaded

In [8]:
#print the stopwords in english
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
column_names=['target','id','date','flag','user','text']

In [10]:
twitter_data=pd.read_csv('training.1600000.processed.noemoticon.csv',encoding='ISO-8859-1',names=column_names)

In [11]:
twitter_data.replace({'target':{4:1}},inplace = True)

In [12]:
twitter_data.shape

(1600000, 6)

In [13]:
twitter_data['target'].value_counts()

0    800000
1    800000
Name: target, dtype: int64

In [14]:
# 0- negative
# 1- positive

In [15]:
class_N_data = twitter_data[twitter_data['target'] == 0]
class_Y_data = twitter_data[twitter_data['target'] == 1]

# Sample 25,000 rows from each class
class_N_sample = class_N_data.sample(n=25000, replace=False, random_state=42)
class_Y_sample = class_Y_data.sample(n=25000, replace=False, random_state=42)

#joining the two samples
final_sample = pd.concat([class_N_sample, class_Y_sample])

# Shuffling the final sample to mix the classes 
twitter_data = final_sample.sample(frac=1,random_state=42).reset_index(drop=True)

In [16]:
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,1,2064229792,Sun Jun 07 05:38:57 PDT 2009,NO_QUERY,PinoyTarsier,@indykitty *hug indykitty* sleep tight indy...
1,0,2063435330,Sun Jun 07 02:38:52 PDT 2009,NO_QUERY,Hannah_oxberry,@Shough yeah I feel really bad for them tryin...
2,0,2251087443,Sat Jun 20 02:23:50 PDT 2009,NO_QUERY,meabhaline,@embeep sorry about your sadness I'll be home...
3,0,2066987792,Sun Jun 07 11:32:31 PDT 2009,NO_QUERY,PDKG,Couldn't spend time with the family cuz of wor...
4,1,1557513350,Sun Apr 19 04:34:04 PDT 2009,NO_QUERY,eulaivi,is new on twitter


In [17]:
#counting the missing val
twitter_data.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

checking the distribution of the target column

In [18]:
twitter_data['target'].value_counts()

1    25000
0    25000
Name: target, dtype: int64

**Stemming**

In [19]:
port_stem= PorterStemmer()

In [20]:
def stemming(content):
  stemmed_content=re.sub('[^a-zA-Z]',' ', content)
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split()
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content)

  return stemmed_content

In [21]:
twitter_data['stemmed_content'] = twitter_data['text'].apply(stemming)

In [22]:
twitter_data['stemmed_content']

0                 indykitti hug indykitti sleep tight indi
1        shough yeah feel realli bad tri best help know...
2        embeep sorri sad home next week faff stalk cel...
3            spend time famili cuz work went beach without
4                                              new twitter
                               ...                        
49995    miszjuicybaybe im guessin ya got pass disney r...
49996    make follow phone call mildli terrifi peopl ac...
49997    strangegod say love friday strip much may stea...
49998          got back church broke foot caught holyghost
49999    infam got point wre want break control search ...
Name: stemmed_content, Length: 50000, dtype: object

In [23]:
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,1,2064229792,Sun Jun 07 05:38:57 PDT 2009,NO_QUERY,PinoyTarsier,@indykitty *hug indykitty* sleep tight indy...,indykitti hug indykitti sleep tight indi
1,0,2063435330,Sun Jun 07 02:38:52 PDT 2009,NO_QUERY,Hannah_oxberry,@Shough yeah I feel really bad for them tryin...,shough yeah feel realli bad tri best help know...
2,0,2251087443,Sat Jun 20 02:23:50 PDT 2009,NO_QUERY,meabhaline,@embeep sorry about your sadness I'll be home...,embeep sorri sad home next week faff stalk cel...
3,0,2066987792,Sun Jun 07 11:32:31 PDT 2009,NO_QUERY,PDKG,Couldn't spend time with the family cuz of wor...,spend time famili cuz work went beach without
4,1,1557513350,Sun Apr 19 04:34:04 PDT 2009,NO_QUERY,eulaivi,is new on twitter,new twitter


**Lemmatisation**

In [24]:
import spacy

nlp = spacy.load("en_core_web_sm")


def lemmatize_text(text):
    doc = nlp(text)
    lemmatized_text = " ".join([token.lemma_ for token in doc])
    return lemmatized_text


In [25]:
twitter_data['lemmatisation']=twitter_data['stemmed_content'].apply(lemmatize_text)

In [26]:
twitter_data['lemmatisation']

0                 indykitti hug indykitti sleep tight indi
1        shough yeah feel realli bad tri good help know...
2        embeep sorri sad home next week faff stalk cel...
3              spend time famili cuz work go beach without
4                                              new twitter
                               ...                        
49995    miszjuicybaybe I m guessin ya get pass disney ...
49996    make follow phone call mildli terrifi peopl ac...
49997    strangegod say love friday strip much may stea...
49998           get back church break foot catch holyghost
49999    infam get point wre want break control search ...
Name: lemmatisation, Length: 50000, dtype: object

In [27]:
X= twitter_data['lemmatisation'].values
Y= twitter_data['target'].values

In [28]:
X

array(['indykitti hug indykitti sleep tight indi',
       'shough yeah feel realli bad tri good help know say',
       'embeep sorri sad home next week faff stalk celeb twitter', ...,
       'strangegod say love friday strip much may steal wallet whenev meet',
       'get back church break foot catch holyghost',
       'infam get point wre want break control search hour freak blast chard angri'],
      dtype=object)

In [29]:
Y

array([1, 0, 0, ..., 1, 0, 0])

In [30]:
#splitting the data to train and test data
X_train,X_test,Y_train,Y_test= train_test_split(X,Y,test_size= 0.2,stratify= Y, random_state=2)

In [31]:
X_train.shape

(40000,)

In [32]:
X_test.shape

(10000,)

In [33]:
#converting the textual data to numerical data
vectorizer=TfidfVectorizer()

X_train= vectorizer.fit_transform(X_train)
X_test=vectorizer.transform(X_test)

In [34]:
print(X_train)

  (0, 28830)	0.4456735564685389
  (0, 39044)	0.30595522606139547
  (0, 35244)	0.3122648897756591
  (0, 23012)	0.48737966703573504
  (0, 35255)	0.2584981251690527
  (0, 38284)	0.32541883111622955
  (0, 20387)	0.23887710633089906
  (0, 33649)	0.37807037862538795
  (1, 7978)	0.6274800667278917
  (1, 21316)	0.33487476078493117
  (1, 3753)	0.2998147015279358
  (1, 15596)	0.2365590000844638
  (1, 19788)	0.3979202837014403
  (1, 24814)	0.27106049304874447
  (1, 13283)	0.18640892446060903
  (1, 37983)	0.285860130979689
  (2, 30157)	0.504763048949136
  (2, 7028)	0.20513471965440305
  (2, 27190)	0.2672814755806756
  (2, 27539)	0.6148563470039692
  (2, 39706)	0.4084981758027527
  (2, 30734)	0.22486306212635163
  (2, 15596)	0.1902948770335579
  (3, 29639)	0.46025502603948687
  (3, 31845)	0.5117237655717117
  :	:
  (39996, 13920)	0.427115823273213
  (39996, 13632)	0.427115823273213
  (39996, 12634)	0.364070415136711
  (39996, 8811)	0.4112328165239006
  (39996, 11791)	0.33892751281962924
  (39996, 3

In [35]:
print(X_test)

  (0, 38248)	0.35160095115245765
  (0, 38056)	0.2086162348089924
  (0, 35332)	0.2524756615802417
  (0, 33998)	0.27429488632634325
  (0, 31061)	0.3013419597020264
  (0, 27396)	0.433819771717304
  (0, 19278)	0.20895737941580333
  (0, 12875)	0.4514826809049862
  (0, 6111)	0.28451537599543447
  (0, 1128)	0.2939711305731157
  (1, 32837)	0.3636609415195808
  (1, 28904)	0.48784298224117195
  (1, 26078)	0.6357068169709108
  (1, 20044)	0.3550278026845089
  (1, 12782)	0.3155820756457087
  (2, 38596)	0.3540099382296676
  (2, 37229)	0.2131143900595945
  (2, 34546)	0.3103734747218957
  (2, 34538)	0.28338619101030704
  (2, 24411)	0.2396833345100523
  (2, 24131)	0.22113137257787774
  (2, 22472)	0.21830309344175786
  (2, 15622)	0.2675078195215352
  (2, 15044)	0.19946137871902833
  (2, 11552)	0.23743219155712345
  :	:
  (9995, 3231)	0.5187282016288416
  (9996, 38284)	0.3535041031771978
  (9996, 36693)	0.6074802506139857
  (9996, 21034)	0.2641806213315596
  (9996, 14780)	0.37181733424379565
  (9996, 139

**Logistic Regression Model**

In [36]:
logit_model= LogisticRegression(max_iter=1000)

In [37]:
logit_model.fit(X_train,Y_train)

In [38]:
#MODEL EVALUATION
X_train_prediction =logit_model.predict(X_train)
training_data_accuracy= accuracy_score(Y_train,X_train_prediction)

In [39]:
training_data_accuracy

0.83405

In [40]:
X_test_prediction=logit_model.predict(X_test)
accuracy_lr=accuracy_score(Y_test,X_test_prediction)

In [42]:
accuracy_lr

0.7448

**Neural Networks**

In [43]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam


# Build the neural network model
def build_model(input_dim):
    model = Sequential([
        Dense(512, input_dim=input_dim, activation='relu'),
        Dropout(0.5),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(128, activation='relu'),   # Third hidden layer
        Dropout(0.5),
        Dense(64, activation='relu'),    # Fourth hidden layer
        Dropout(0.5),
        Dense(32, activation='relu'),    # Fifth hidden layer
        Dropout(0.5),
        Dense(1, activation='sigmoid')  # Use sigmoid for binary classification
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model


In [44]:

nn = build_model(X_train.shape[1])  # Input dimension
nn.fit(X_train, Y_train, epochs=10, batch_size=64, validation_split=0.1, verbose=1)

# Evaluate the model on the test data
loss, accuracy = nn.evaluate(X_test, Y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2024-04-28 18:03:56.542606: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Pro
2024-04-28 18:03:56.542631: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-04-28 18:03:56.542644: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-04-28 18:03:56.542886: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-04-28 18:03:56.542903: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/10


2024-04-28 18:03:57.558835: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 118ms/step - accuracy: 0.5951 - loss: 0.6462 - val_accuracy: 0.7290 - val_loss: 0.5460
Epoch 2/10
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 107ms/step - accuracy: 0.8246 - loss: 0.4149 - val_accuracy: 0.7235 - val_loss: 0.5498
Epoch 3/10
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 102ms/step - accuracy: 0.9077 - loss: 0.2544 - val_accuracy: 0.7125 - val_loss: 0.6441
Epoch 4/10
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 104ms/step - accuracy: 0.9605 - loss: 0.1213 - val_accuracy: 0.6985 - val_loss: 1.0402
Epoch 5/10
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 104ms/step - accuracy: 0.9829 - loss: 0.0583 - val_accuracy: 0.7045 - val_loss: 1.1532
Epoch 6/10
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 107ms/step - accuracy: 0.9887 - loss: 0.0370 - val_accuracy: 0.6980 - val_loss: 1.5568
Epoch 7/10
[1m563/56