In [1]:
import pandas as pd
import numpy as np
import nltk
import nltk.corpus
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
df = pd.read_csv('Tweets.csv')

In [3]:
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,5.70306e+17,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,24/02/2015 11:35,,Eastern Time (US & Canada)
1,5.70301e+17,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,24/02/2015 11:15,,Pacific Time (US & Canada)
2,5.70301e+17,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,24/02/2015 11:15,Lets Play,Central Time (US & Canada)
3,5.70301e+17,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,24/02/2015 11:15,,Pacific Time (US & Canada)
4,5.70301e+17,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,24/02/2015 11:14,,Pacific Time (US & Canada)


In [4]:
df = df[['text','negativereason','airline_sentiment']]

corpus - vocabulary of words.

In [5]:
df.head()

Unnamed: 0,text,negativereason,airline_sentiment
0,@VirginAmerica What @dhepburn said.,,neutral
1,@VirginAmerica plus you've added commercials t...,,positive
2,@VirginAmerica I didn't today... Must mean I n...,,neutral
3,@VirginAmerica it's really aggressive to blast...,Bad Flight,negative
4,@VirginAmerica and it's a really big bad thing...,Can't Tell,negative


In [6]:
df['Tweet'] = df['text'] + df['negativereason'].fillna(' ')

In [7]:
df.head()


Unnamed: 0,text,negativereason,airline_sentiment,Tweet
0,@VirginAmerica What @dhepburn said.,,neutral,@VirginAmerica What @dhepburn said.
1,@VirginAmerica plus you've added commercials t...,,positive,@VirginAmerica plus you've added commercials t...
2,@VirginAmerica I didn't today... Must mean I n...,,neutral,@VirginAmerica I didn't today... Must mean I n...
3,@VirginAmerica it's really aggressive to blast...,Bad Flight,negative,@VirginAmerica it's really aggressive to blast...
4,@VirginAmerica and it's a really big bad thing...,Can't Tell,negative,@VirginAmerica and it's a really big bad thing...


In [8]:
df.drop('text', axis = 1, inplace = True)
df.drop('negativereason', axis = 1, inplace = True)
df.head()

Unnamed: 0,airline_sentiment,Tweet
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [9]:
def text_cleaner(text):
  words = re.sub('[^a-zA-Z]', ' ', text)
  words = words.lower()
  words = words.split()
  stop_words = set(stopwords.words('english'))
  words = [word for word in words if word not in stop_words]
  return ' '.join(words)

In [10]:
df['cleaned_tweet'] = df['Tweet'].apply(lambda x:text_cleaner(x))

In [11]:
df.head()

Unnamed: 0,airline_sentiment,Tweet,cleaned_tweet
0,neutral,@VirginAmerica What @dhepburn said.,virginamerica dhepburn said
1,positive,@VirginAmerica plus you've added commercials t...,virginamerica plus added commercials experienc...
2,neutral,@VirginAmerica I didn't today... Must mean I n...,virginamerica today must mean need take anothe...
3,negative,@VirginAmerica it's really aggressive to blast...,virginamerica really aggressive blast obnoxiou...
4,negative,@VirginAmerica and it's a really big bad thing...,virginamerica really big bad thing itcan tell


In [12]:
label = LabelEncoder()
df['airline_sentiment'] = label.fit_transform(df['airline_sentiment'])

In [13]:
df.head()

Unnamed: 0,airline_sentiment,Tweet,cleaned_tweet
0,1,@VirginAmerica What @dhepburn said.,virginamerica dhepburn said
1,2,@VirginAmerica plus you've added commercials t...,virginamerica plus added commercials experienc...
2,1,@VirginAmerica I didn't today... Must mean I n...,virginamerica today must mean need take anothe...
3,0,@VirginAmerica it's really aggressive to blast...,virginamerica really aggressive blast obnoxiou...
4,0,@VirginAmerica and it's a really big bad thing...,virginamerica really big bad thing itcan tell


In [14]:
# feautre and target
x = df['cleaned_tweet']
y = df['airline_sentiment']



In [15]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3, random_state = 42)

In [16]:
# applying countVectorizer
cv = CountVectorizer()
x_train_cv = cv.fit_transform(x_train)
x_test_cv = cv.transform(x_test)

In [17]:
svc = SVC()
svc.fit(x_train_cv,y_train)
y_pred = svc.predict(x_test_cv)

In [18]:
accuracy_score(y_test,y_pred)

0.9023224043715847

In [19]:
# applying TfIdf vectorizer
tfidf = TfidfVectorizer()
x_train_tfidf = tfidf.fit_transform(x_train)
x_test_tfidf = tfidf.transform(x_test)

In [20]:
svc = SVC()
svc.fit(x_train_tfidf,y_train)
y_pred = svc.predict(x_test_tfidf)

In [21]:
accuracy_score(y_test,y_pred)

0.9000455373406193

In [22]:
# what is working diff between countVectorizer and Tfidf Vectorizer

In [23]:
import tensorflow