<a href="https://colab.research.google.com/github/SamimNiazi/Kaggle_machine_learning/blob/main/Tweet_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Getting data and extracting it

In [None]:
!pip install kaggle



In [None]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [None]:
# Download the Sentiment140 dataset archive with the Kaggle CLI.
!kaggle datasets download kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
  0% 0.00/80.9M [00:00<?, ?B/s]
100% 80.9M/80.9M [00:00<00:00, 1.15GB/s]


In [None]:
from zipfile import ZipFile
dataset = '/content/sentiment140.zip'

# Extract every member of the archive into the current working directory.
# The handle is named `archive` so the built-in zip() is not shadowed for later cells.
with ZipFile(dataset, 'r') as archive:
  archive.extractall()
  print('the dataset is extracted')

the dataset is extracted


# Importing dependencies

In [None]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Download NLTK's stop-word corpus; it is used further down to drop
# low-information words from the tweets and so shrink the text.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Data Processing

In [None]:
# Column labels are supplied explicitly via `names=`; the file is read as ISO-8859-1.
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
twitter_data = pd.read_csv(
    '/content/training.1600000.processed.noemoticon.csv',
    names=column_names,
    encoding='ISO-8859-1',
)
twitter_data.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Dataset dimensions as (rows, columns)
twitter_data.shape

(1600000, 6)

In [None]:
# Count missing values in each column
twitter_data.isnull().sum()

Unnamed: 0,0
target,0
ids,0
date,0
flag,0
user,0
text,0


In [None]:
# Relabel the target column: 4 -> 1 (positive tweet)
twitter_data['target'] = twitter_data['target'].replace({4: 1})

# Check that the classes are balanced (if they are not, we need to upsample or downsample)
twitter_data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
1,800000


In [None]:
#Downsizing

# Number of tweets to keep for each sentiment class.
n_per_class = 200_000

# Sample each class separately (same seed per class) and stack the results.
# Iterating over the groups keeps the 'target' column in every sampled frame and
# avoids groupby.apply on the grouping column, which pandas has deprecated.
twitter_data = pd.concat(
    [
        class_rows.sample(n=n_per_class, random_state=42)
        for _, class_rows in twitter_data.groupby('target')
    ]
)

twitter_data.head()

  .apply(lambda x: x.sample(n=n_per_class, random_state=42))


Unnamed: 0,target,ids,date,flag,user,text
239563,0,1983404733,Sun May 31 13:05:32 PDT 2009,NO_QUERY,jessmeccuhlee,Just gave away my baby chicken
144462,0,2203344883,Tue Jun 16 23:32:04 PDT 2009,NO_QUERY,cassidymckinney,my 16 day vacation ends in 11.5 hours
316471,0,2202350259,Tue Jun 16 21:38:53 PDT 2009,NO_QUERY,justincourt,@beaniewasmyname hahaha that's what i made it ...
130556,0,2195145379,Tue Jun 16 10:44:23 PDT 2009,NO_QUERY,ashleymilli,1 exam down 3 to go advanced functions tomorr...
60114,0,1963419701,Fri May 29 12:41:45 PDT 2009,NO_QUERY,Leahstaplehurst,oh no noisy family who live next door are back...


## Stemming

Stemming reduces each word to its root form, so that different forms of the same word are treated as a single token.

In [None]:
# Single shared Porter stemmer instance, reused for every tweet.
port_stem = PorterStemmer()

In [None]:
# Cache for stop-word sets, keyed by language, so the corpus is read only once.
_STOP_WORD_CACHE = {}


def _english_stop_words():
  """Return NLTK's English stop words as a frozenset, building it on first use."""
  if 'english' not in _STOP_WORD_CACHE:
    _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
  return _STOP_WORD_CACHE['english']


def stemming(content, stemmer=None, stop_words=None):
  """Clean a piece of text and reduce its words to their stems.

  Steps: replace every non-alphabetical character with a space, lowercase,
  split on whitespace, drop stop words, stem the remaining words, and join
  them back together with single spaces.

  Args:
    content: The raw text (e.g. a tweet) to process.
    stemmer: Object with a ``stem(word)`` method. Defaults to the
      module-level ``port_stem``.
    stop_words: Collection of lowercase words to discard. Defaults to NLTK's
      English stop words.

  Returns:
    The processed text as a single space-separated string (empty if no
    words survive the filtering).
  """
  if stemmer is None:
    stemmer = port_stem
  if stop_words is None:
    # Looked up once per call instead of once per word; rebuilding the list
    # for every word was what made this function so slow.
    stop_words = _english_stop_words()

  letters_only = re.sub('[^a-zA-Z]', ' ', content)  # removing non alphabetical characters
  words = letters_only.lower().split()
  return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)

In [None]:
# Author's estimate: this takes 30-50 minutes if the data has not been downsized.

# Store the cleaned, stemmed version of each tweet in a new column.
twitter_data['stemmed_content'] = twitter_data['text'].apply(stemming)
twitter_data.head()

Unnamed: 0,target,ids,date,flag,user,text,stemmed_content
239563,0,1983404733,Sun May 31 13:05:32 PDT 2009,NO_QUERY,jessmeccuhlee,Just gave away my baby chicken,gave away babi chicken
144462,0,2203344883,Tue Jun 16 23:32:04 PDT 2009,NO_QUERY,cassidymckinney,my 16 day vacation ends in 11.5 hours,day vacat end hour
316471,0,2202350259,Tue Jun 16 21:38:53 PDT 2009,NO_QUERY,justincourt,@beaniewasmyname hahaha that's what i made it ...,beaniewasmynam hahaha made sound like got hair...
130556,0,2195145379,Tue Jun 16 10:44:23 PDT 2009,NO_QUERY,ashleymilli,1 exam down 3 to go advanced functions tomorr...,exam go advanc function tomorrow eep
60114,0,1963419701,Fri May 29 12:41:45 PDT 2009,NO_QUERY,Leahstaplehurst,oh no noisy family who live next door are back...,oh noisi famili live next door back holiday go...


In [None]:
# Separate the data (stemmed tweet text) from the label (sentiment target)

X = twitter_data['stemmed_content'].values
Y = twitter_data['target'].values

In [None]:
# Split into training and test data: 20% is held out for testing,
# stratified on the labels, with a fixed seed for reproducibility.

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

print(X_train)
print(X_test)

['go excit week david cook manila yippe' 'ryn look cute kid'
 'want work either' ... 'follow remov'
 'miss michigan girl mother say complet innoc bodi found http shar es otco tcot innoc child rip'
 'spendin time bff erin missin boo']
['get recov long night fun yesterday'
 'still twitter wtf close tweetdeck tempt bye'
 'take foreverrr jess get im bore' ...
 'himynameisemmi omg ok ty look couldnt find im comp rn idk dont wanna miss boy'
 'pearlster move'
 'last year time partyin eddiedashmac uncl crib like alll time lol good time']


In [None]:
# Convert the textual data to numerical data (TF-IDF features)

vectorizer = TfidfVectorizer()

# Fit on the training split only, then apply the same fitted vectorizer to the test split.
# NOTE(review): X_train / X_test are overwritten with the vectorized output here, so
# re-running this cell without re-running the split cell presumably fails — confirm.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

print(X_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2364325 stored elements and shape (320000, 180075)>
  Coords	Values
  (0, 58762)	0.18124071666725988
  (0, 49159)	0.30186674320449075
  (0, 171249)	0.2618112340899794
  (0, 36445)	0.38297335155631695
  (0, 31560)	0.3687478219179311
  (0, 97502)	0.4817000920484196
  (0, 177710)	0.5411142456287595
  (1, 135866)	0.7709664308157482
  (1, 93452)	0.30126704195081005
  (1, 34188)	0.40150276557022463
  (1, 84201)	0.39197507730581055
  (2, 170377)	0.48617755519410377
  (2, 174194)	0.44160253704916086
  (2, 45374)	0.7540680235212397
  (3, 179973)	0.8347880355736645
  (3, 60208)	0.31578757797931156
  (3, 110296)	0.4510068084375472
  (4, 58762)	0.31032318670328696
  (4, 38648)	0.7696067167875927
  (4, 142466)	0.5580367561995706
  (5, 111206)	0.20834353841433995
  (5, 51864)	0.2543002817867189
  (5, 111891)	0.21070530165782203
  (5, 166586)	0.27774745198445866
  (5, 130589)	0.3728241997704869
  :	:
  (319996, 94581)	0.8157596258997524
  

# Training the machine learning model


Logistic regression

In [None]:
# Logistic regression classifier with the solver's iteration cap set to 1000.
model = LogisticRegression(max_iter = 1000)

In [None]:
# Train the classifier on the vectorized training tweets and their labels.
model.fit(X_train, Y_train)

Model Evaluation

Accuracy Score

In [None]:
# Predict on the data the model was trained on and score it against the true labels.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# Report the accuracy measured on the training split.
print('Accuracy score on training data = ', training_data_accuracy)

Accuracy score on training data =  0.81184375


In [None]:
# Predict on the held-out test split and score it against the true labels.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# Report the accuracy measured on the test split.
print('Accuracy score on testing data = ', test_data_accuracy)

Accuracy score on testing data =  0.7697


Model accuracy on the test data is ~77% (training accuracy is ~81%).

Saving trained model

In [None]:
import pickle

In [None]:
filename='trained_model.sav'
# Write the model inside a context manager so the file is flushed and closed
# as soon as the dump finishes, instead of leaving the handle open.
with open(filename, 'wb') as model_file:
  pickle.dump(model, model_file)

Using saved model

In [None]:
# Read the model back from the same relative path it was saved to, so this works
# regardless of the working directory, and close the file once loading is done.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('trained_model.sav', 'rb') as model_file:
  loaded_model = pickle.load(model_file)

In [None]:
# Classify one held-out tweet (row 200 of the test split) with the reloaded model,
# printing the true label first and the predicted label after it.
X_new = X_test[200]
print(Y_test[200])

prediction = loaded_model.predict(X_new)
print(prediction)

# A prediction of 0 means negative; anything else is reported as positive.
print('Negative tweet' if prediction[0] == 0 else 'Positive tweet')

0
[0]
Negative tweet
