# Importing libraries

In [1]:
# utilities
import re
import numpy as np
import pandas as pd
# plotting
import seaborn as sns
# from wordcloud import WordCloud
import matplotlib.pyplot as plt
# nltk
from nltk.stem import WordNetLemmatizer
# sklearn
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

# Reading the Data

In [2]:
DATASET_COLUMNS = ['sentiment','id','date_time','query','user_id','tweet']
DATASET_ENCODING = 'latin'
ENGINE = 'python'
raw_data = pd.read_csv("../Data/training.1600000.processed.noemoticon.csv" , encoding=DATASET_ENCODING,
                       names=DATASET_COLUMNS ,engine=ENGINE)
raw_data['tweet'] = raw_data['tweet'].astype(str)
raw_data.head()

Unnamed: 0,sentiment,id,date_time,query,user_id,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


# Data Cleaning

In [3]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   sentiment  1600000 non-null  int64 
 1   id         1600000 non-null  int64 
 2   date_time  1600000 non-null  object
 3   query      1600000 non-null  object
 4   user_id    1600000 non-null  object
 5   tweet      1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [4]:
df = raw_data.drop(['id','date_time','query','user_id'], axis=1)
df.head()

Unnamed: 0,sentiment,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   sentiment  1600000 non-null  int64 
 1   tweet      1600000 non-null  object
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


In [6]:
df['sentiment'].value_counts()

0    800000
4    800000
Name: sentiment, dtype: int64

In [7]:
df['sentiment'] = df['sentiment'].replace(4,1)

In [8]:
df.shape

(1600000, 2)

In [9]:
df['sentiment'].value_counts()

0    800000
1    800000
Name: sentiment, dtype: int64

In [10]:
df_positive = df[df['sentiment'] == 1]
df_negative = df[df['sentiment'] == 0]

In [11]:
df_positive = df_positive.iloc[:int(5000)]
df_negative = df_negative.iloc[:int(5000)]

In [12]:
dataset = pd.concat([df_positive, df_negative], ignore_index=True)
dataset.tweet= dataset.tweet.astype(str)


In [13]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  10000 non-null  int64 
 1   tweet      10000 non-null  object
dtypes: int64(1), object(1)
memory usage: 156.4+ KB


In [14]:
dataset.head()

Unnamed: 0,sentiment,tweet
0,1,I LOVE @Health4UandPets u guys r the best!!
1,1,im meeting up with one of my besties tonight! ...
2,1,"@DaRealSunisaKim Thanks for the Twitter add, S..."
3,1,Being sick can be really cheap when it hurts t...
4,1,@LovesBrooklyn2 he has that effect on everyone


## Removing Emails

In [15]:
dataset['tweet'] = dataset['tweet'].apply(lambda x: re.sub(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)', "", x))

## Removing URL

In [16]:
def cleaning_URLs(data):
    return re.sub('((www.[^s]+)|(https?://[^s]+))',' ',data)
dataset['tweet'] = dataset['tweet'].apply(lambda x: cleaning_URLs(x))
dataset['tweet'].tail()

9995                              long day today 
9996               a friend broke his promises.. 
9997         @gjarnling I am fine thanks - tired 
9998    trying to keep my eyes open..damn baking 
9999                  why the hell is it snowing 
Name: tweet, dtype: object

## Removing Special Characters

In [17]:
def Removing_Special_characters(data):
    return re.sub('[^\w]+', ' ', data)
dataset['tweet'] = dataset['tweet'].apply(lambda x: Removing_Special_characters(x))
dataset['tweet'].tail()

9995                             long day today 
9996                a friend broke his promises 
9997           gjarnling I am fine thanks tired 
9998    trying to keep my eyes open damn baking 
9999                 why the hell is it snowing 
Name: tweet, dtype: object

## Removing Numbers

In [18]:
def cleaning_numbers(data):
    return re.sub('[0-9]+', '', data)
dataset['tweet'] = dataset['tweet'].apply(lambda x: cleaning_numbers(x))
dataset['tweet'].tail()

9995                             long day today 
9996                a friend broke his promises 
9997           gjarnling I am fine thanks tired 
9998    trying to keep my eyes open damn baking 
9999                 why the hell is it snowing 
Name: tweet, dtype: object

## Stemming and stopword removal

In [19]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(0, len(dataset)):
    review = re.sub('[^a-zA-Z]', ' ', dataset['tweet'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

## Bag of words

In [20]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()

In [21]:
cv.vocabulary_

{'love': 1305,
 'guy': 970,
 'best': 203,
 'im': 1100,
 'meet': 1364,
 'one': 1546,
 'besti': 204,
 'tonight': 2234,
 'cant': 337,
 'wait': 2357,
 'girl': 909,
 'talk': 2146,
 'thank': 2180,
 'twitter': 2287,
 'add': 17,
 'got': 935,
 'show': 1929,
 'dc': 552,
 'area': 107,
 'sweetheart': 2130,
 'sick': 1935,
 'realli': 1763,
 'cheap': 376,
 'hurt': 1084,
 'much': 1452,
 'eat': 679,
 'real': 1759,
 'food': 837,
 'plu': 1652,
 'friend': 866,
 'make': 1330,
 'soup': 2023,
 'effect': 686,
 'everyon': 731,
 'tell': 2169,
 'laugh': 1238,
 'loud': 1303,
 'come': 441,
 'hill': 1036,
 'respons': 1802,
 'alreadi': 58,
 'find': 809,
 'answer': 85,
 'jealou': 1158,
 'hope': 1058,
 'great': 948,
 'time': 2217,
 'vega': 2336,
 'like': 1269,
 'tommcfli': 2230,
 'ah': 32,
 'congrat': 464,
 'mr': 1445,
 'final': 808,
 'join': 1174,
 'respond': 1801,
 'stupid': 2096,
 'cat': 352,
 'help': 1025,
 'type': 2293,
 'error': 717,
 'crazi': 503,
 'day': 550,
 'school': 1865,
 'hour': 1066,
 'watch': 2370,
 't

In [22]:
y=pd.get_dummies(dataset['sentiment'])
y=y.iloc[:,1].values

## Building Model

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# Training model using Naive bayes classifier

from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB().fit(X_train, y_train)

y_pred=model.predict(X_test)

In [24]:
## Import library to check accuracy
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

matrix=confusion_matrix(y_test,y_pred)
print(matrix)
score=accuracy_score(y_test,y_pred)
print(score)
report=classification_report(y_test,y_pred)
print(report)

[[712 300]
 [278 710]]
0.711
              precision    recall  f1-score   support

           0       0.72      0.70      0.71      1012
           1       0.70      0.72      0.71       988

    accuracy                           0.71      2000
   macro avg       0.71      0.71      0.71      2000
weighted avg       0.71      0.71      0.71      2000

