# **Objective: Preprocess text data from Twitter comments to aid sentiment analysis**

In [0]:
import sklearn
import pandas as pd
#import nltk as nl
import re
import numpy as np
from google.colab import files

In [3]:
# Opens the Colab file-upload widget (`files` comes from google.colab); the recorded run
# uploaded Tweets.csv, which the next cell reads by its relative path.
uploaded = files.upload()

Saving Tweets.csv to Tweets.csv


In [4]:
# Load only the two columns the analysis needs: the sentiment label and the tweet text.
raw_data = pd.read_csv(
    "Tweets.csv",
    usecols=['airline_sentiment', 'text'],
)
raw_data.head()

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


## **1. Text Pre-processing and cleaning words not suitable for classification**

In [5]:
# regex=True is passed explicitly on every pattern below: pandas >= 2.0 makes
# Series.str.replace match literally by default, which would silently leave the text unchanged.
#1.1 Removing twitter handles ('@' followed by any run of word characters)
raw_data['text'] = raw_data['text'].str.replace(r"@\w*", " ", regex=True)
#1.2 Converting into lower case (splitting and re-joining also collapses whitespace runs)
raw_data['text'] = raw_data['text'].str.lower().str.split().str.join(" ")
#1.3 Removing punctuations (every character that is not a letter or '#' becomes a space)
raw_data['text'] = raw_data['text'].str.replace(r"[^a-zA-Z#]", " ", regex=True)
#1.4 Removing 1 and 2 letter words (single pass; equivalent to removing 1-letter then 2-letter words)
raw_data['text'] = raw_data['text'].str.replace(r"\b[a-zA-Z]{1,2}\b", " ", regex=True)
raw_data.head()

Unnamed: 0,airline_sentiment,text
0,neutral,what said
1,positive,plus you added commercials the experience ...
2,neutral,didn today must mean need take anot...
3,negative,really aggressive blast obnoxious enter...
4,negative,and really big bad thing about


In [6]:
#1.5 Removing stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
# Set membership is O(1); scanning the list for every token of every tweet is not.
stop_set = set(stop)
# Keep only the whitespace-separated tokens that are not English stop words.
raw_data['text'] = raw_data['text'].apply(
    lambda text: " ".join(token for token in text.split() if token not in stop_set)
)
raw_data['text'].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


0                                                 said
1              plus added commercials experience tacky
2               today must mean need take another trip
3    really aggressive blast obnoxious entertainmen...
4                                 really big bad thing
Name: text, dtype: object

In [7]:
#1.6 Rare words removal: drop the 20 least frequent words of the whole corpus

# value_counts() sorts by descending frequency, so the last 20 entries are the rarest words.
word_counts = pd.Series(' '.join(raw_data['text']).split()).value_counts()
freq = word_counts.tail(20).index.tolist()
raw_data['text'] = raw_data['text'].apply(
    lambda text: " ".join(token for token in text.split() if token not in freq)
)
raw_data['text'].head()

0                                                 said
1              plus added commercials experience tacky
2               today must mean need take another trip
3    really aggressive blast obnoxious entertainmen...
4                                 really big bad thing
Name: text, dtype: object

In [8]:
#1.7 Stemming
# Import only the name that is used; the wildcard import pulled every public name
# of nltk.stem.porter into the notebook namespace.
from nltk.stem.porter import PorterStemmer
st = PorterStemmer()

# Replace each whitespace-separated token by its Porter stem.
raw_data['text'] = raw_data['text'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
raw_data.head()


Unnamed: 0,airline_sentiment,text
0,neutral,said
1,positive,plu ad commerci experi tacki
2,neutral,today must mean need take anoth trip
3,negative,realli aggress blast obnoxi entertain guest fa...
4,negative,realli big bad thing


In [9]:
#1.8 Lemmatization
# NOTE(review): the tokens reaching this step were already stemmed in step 1.7 — confirm that
# lemmatizing stems (rather than the original words) is intended.

nltk.download('wordnet')
from textblob import Word
# Lemmatize every whitespace-separated token and re-join with single spaces.
raw_data['text'] = raw_data['text'].apply(
    lambda text: " ".join(Word(token).lemmatize() for token in text.split())
)
raw_data['text'].head()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


0                                                 said
1                         plu ad commerci experi tacki
2                 today must mean need take anoth trip
3    realli aggress blast obnoxi entertain guest fa...
4                                 realli big bad thing
Name: text, dtype: object

In [0]:
#1.9 Creating numeric labels
# neutral -> 0, positive -> 1, negative -> 2; any other sentiment value is left as NaN in 'new'.
sentiment_codes = {'neutral': 0, 'positive': 1, 'negative': 2}
raw_data['new'] = raw_data['airline_sentiment'].map(sentiment_codes).astype('float64')
# Integer copy of the codes; this is the column used as the classification target.
raw_data['labl'] = raw_data['new'].astype(np.int64)

## **Bag of words**

In [0]:
# bag-of-words feature matrix: one row per tweet, one column per vocabulary term
from sklearn.feature_extraction.text import CountVectorizer
bow_vectorizer = CountVectorizer()
# Learn the vocabulary and count term occurrences (sparse result) ...
bagof = bow_vectorizer.fit_transform(raw_data['text'])
# ... then densify it into a plain array for the models below.
bagof = bagof.toarray()

In [0]:
# Dividing the data into train data and test data (67% train / 33% test)
import sklearn
from sklearn.model_selection import train_test_split
# A fixed random_state makes the split, and every score computed from it,
# reproducible across kernel restarts; without it each run shuffles differently.
train, test, train_labels, test_labels = train_test_split(bagof,
                                                          raw_data['labl'],
                                                          test_size=0.33,
                                                          random_state=17)

## **2. Building the Model**

**2.1 Cross validation - Naive Bayes**

In [13]:
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

# NOTE(review): GaussianNB models every feature as a Gaussian, while the inputs here are
# word counts — MultinomialNB may be the better fit; confirm before relying on these scores.
gnb = GaussianNB()

# 5-fold cross-validation on the training split: print the per-fold scores, then their mean.
scores_2 = cross_val_score(gnb, train, train_labels, cv=5)
print(scores_2)
print(np.mean(scores_2))

#Testing with test data
# fit() returns the fitted estimator itself, so `model` and `gnb` are the same object.
model = gnb.fit(train, train_labels)
pred1 = model.predict(test)
print(accuracy_score(test_labels, pred1))


[0.46432212 0.45158002 0.45667686 0.45792963 0.46098929]
0.4582995839876573
0.4459850993377483


**2.2 Cross validation - Decision Tree**

In [14]:
  import numpy as np
  from sklearn.metrics import accuracy_score
  from sklearn.model_selection import cross_val_score
  from sklearn.tree import DecisionTreeClassifier

  # max_depth=1 limits the tree to a single split; random_state pins its tie-breaking.
  clf = DecisionTreeClassifier(max_depth=1, random_state=17)

  # 5-fold cross-validation on the training split: print the per-fold scores, then their mean.
  scores_1 = cross_val_score(clf, train, train_labels, cv=5)
  print(scores_1)
  print(np.mean(scores_1))

  #Testing with test data
  # fit() returns the fitted estimator itself, so `dtcmodel` and `clf` are the same object.
  dtcmodel = clf.fit(train, train_labels)
  pred4 = dtcmodel.predict(test)
  print(accuracy_score(test_labels, pred4))

[0.67176351 0.67278287 0.67635066 0.67108618 0.67108618]
0.6726138809746218
0.6684602649006622
