In [None]:
#installing kaggle library
! pip install kaggle



In [None]:
#configuring the path of kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


**Importing the data set**

In [None]:
# Fetching the dataset from kaggle using API
!kaggle datasets download -d kazanova/sentiment140

Downloading sentiment140.zip to /content
100% 80.9M/80.9M [00:00<00:00, 204MB/s]
100% 80.9M/80.9M [00:00<00:00, 183MB/s]


In [None]:
# Extraxcting the dataset from zipfile

from zipfile import ZipFile

dataset = '/content/sentiment140.zip'

with ZipFile(dataset, 'r') as zip:
    zip.extractall()

In [None]:
#importing libraries

import pandas as pd
import numpy as np
import re

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

**Data Processing**

In [None]:
#loading data to pandas
twitter_data = pd.read_csv("/content/training.1600000.processed.noemoticon.csv", encoding = "ISO-8859-1")

In [None]:
twitter_data.shape

(1599999, 6)

In [None]:
twitter_data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [None]:
#naming the columns
column_names = ['target','id','date','flag','user','text']

In [None]:
#loading data to pandas
twitter_data = pd.read_csv("/content/training.1600000.processed.noemoticon.csv",names = column_names, encoding = "ISO-8859-1")

In [None]:
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
twitter_data.shape

(1600000, 6)

In [None]:
twitter_data['target'].unique()

array([0, 4])

In [None]:
#counting the missing value
twitter_data.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [None]:
twitter_data['target'].value_counts()

0    800000
4    800000
Name: target, dtype: int64

In [None]:
twitter_data.replace({'target':{4:1}},inplace=True)

In [None]:
twitter_data['target'].value_counts()

0    800000
1    800000
Name: target, dtype: int64

**Stemming**

In [None]:
snowballstem = SnowballStemmer('english')

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def stemming(sentences):
    sentences = re.sub('[^a-zA-Z]'," ", sentences)
    sentences = sentences.lower()
    words = nltk.word_tokenize(sentences)
    words = [snowballstem.stem(word) for word in words if word not in set(stopwords.words('english'))]
    sentences = " ".join(words)
    return sentences


In [None]:
twitter_data['stemmed_text'] = twitter_data['text'].apply(stemming)

In [62]:
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text,stemmed_text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


In [64]:
#seperating the dependent and independent variable
x = twitter_data['stemmed_text'].values
y = twitter_data['target'].values

In [65]:
#split the data
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, stratify=y)

In [66]:
vectorizer = TfidfVectorizer()

x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

**Logistic Regression**

In [67]:
model = LogisticRegression()

In [68]:
model.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [70]:
x_train_pred = model.predict(x_train)
training_accuracy = accuracy_score(y_train,x_train_pred)
print("Training Dataset accuracy:", training_accuracy)

Training Dataset accuracy: 0.8091109375


In [71]:
x_test_pred = model.predict(x_test)
testing_accurcay = accuracy_score(y_test, x_test_pred)
print("Testing Dataset Accuracy:", testing_accurcay)

Testing Dataset Accuracy: 0.77849375


**Saving the model**

In [72]:
import pickle

In [73]:
filename = "twitter_sentiment_analysis_model.sav"
pickle.dump(model, open(filename,"wb"))

In [75]:
filename = "twitter_sentiment_analysis_model_v2.pk1"
pickle.dump(model, open(filename,"wb"))

**Using the model for furture prediction**

In [74]:
#laoding the model
load_model = pickle.load(open("/content/twitter_sentiment_analysis_model.sav",'rb'))