In [None]:
!pip install nltk

In [1]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /Users/sudarshsnan/Desktop/new
100%|██████████████████████████████████████| 80.9M/80.9M [00:08<00:00, 12.6MB/s]
100%|██████████████████████████████████████| 80.9M/80.9M [00:08<00:00, 9.87MB/s]


In [2]:
from zipfile import ZipFile
file_name = "sentiment140.zip"

with ZipFile(file_name, 'r') as zip:
  zip.extractall()
  print("Done")

Done


In [3]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sudarshsnan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# print(stopwords.stopwords.words('english'))

Data Preprocessing

In [5]:
dataset = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='=ISO-8859-1')
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
dataset.columns = column_names
dataset.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [6]:
dataset.replace({'target':{4:1}},inplace = True)

**Stemming**

In [7]:
port_stem = PorterStemmer()

In [8]:
def stemming(content):
  stemme_content = re.sub('[^a-zA-Z]',' ',content)
  stemme_content = stemme_content.lower()
  stemme_content = stemme_content.split()
  stemmer = [port_stem.stem(word) for word in stemme_content if not word in stopwords.words('english')]
  return ' '.join(stemmer)

In [9]:
dataset['stemmed_content'] = dataset['text'].apply(stemming)

In [10]:
x_train,x_test,y_train,y_test = train_test_split(dataset['stemmed_content'],dataset['target'],test_size = 0.2,stratify = dataset['target'])

**Converting the text data to numerical data using vectorizer**

In [11]:
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

**Training the Machine Learning Model**

In [30]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators = 10, criterion = 'entropy')
# model = LogisticRegression(max_iter = 1000)
model.fit(x_train, y_train)

**Model Evaluation**

In [36]:
y_train_pred = model.predict(x_train)
training_data_accuracy = accuracy_score(y_train_pred,y_train)
print(training_data_accuracy)

0.9853195197808748


In [32]:
test = model.predict(x_test)
test_data_accuracy = accuracy_score(test,y_test)
print(test_data_accuracy)

0.7436625


**Saving the Model**

In [15]:
import pickle

In [33]:
fileName = 'sentiment_analysis_model1.sav'
pickle.dump(model, open(fileName, 'wb'))

**Some Testing**

In [46]:
new_model = pickle.load(open('./sentiment_analysis_model.sav', 'rb'))
y_train_pred = new_model.predict(x_test)
training_data_accuracy = accuracy_score(y_train_pred,y_test)
print(training_data_accuracy)
# x_new = x_test[100]
# # print("This is the actual answer of this case:- " ,y_test[200])
# prediction = model.predict(x_new)
# if(prediction == 0):
#   print("This is a negative review")
# else:
#   print("This is a positive review")


0.776278125


In [28]:
# df = pd.DataFrame(dataset)

In [29]:
# df.to_csv('new_data.csv')