# Read the data set using pandas library

In [5]:
import pandas as pd

df = pd.read_csv('Tweets.csv' , encoding = 'latin-1')
df.head(5)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [6]:
df.shape

(14640, 15)

# drop columns except airline_sentiment and text

In [7]:
df = df[["airline_sentiment", "text"]]

In [8]:
df.shape

(14640, 2)

In [9]:
df.head(5)

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


# preprocess text

In [10]:
import nltk
import string
import re

from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords

nltk.download('punkt')
ps = PorterStemmer()

def clean_text(text):

    text = text.lower()
    text = re.sub(r'http.?://[^\s]+[\s]?', '', text)
    text = nltk.word_tokenize(text)

    y = []
    for i in text:
      if i not in stopwords.words('english'):
         y.append(i)
      text = y[:]

      y.clear()

    for i in text:
      y.append(ps.stem(i))

    return" ".join(y)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


#Apply “clean_text” function to “text” column and assign the resulting text to new column named “text_cleaned”.


In [11]:
df['text_cleaned'] = df['text'].apply(clean_text)

In [12]:
df.head(5)

Unnamed: 0,airline_sentiment,text,text_cleaned
0,neutral,@VirginAmerica What @dhepburn said.,.
1,positive,@VirginAmerica plus you've added commercials t...,.
2,neutral,@VirginAmerica I didn't today... Must mean I n...,!
3,negative,@VirginAmerica it's really aggressive to blast...,recours
4,negative,@VirginAmerica and it's a really big bad thing...,


In [13]:
df.groupby('airline_sentiment').describe()

Unnamed: 0_level_0,text,text,text,text,text_cleaned,text_cleaned,text_cleaned,text_cleaned
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
airline_sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
negative,9178,9087,@AmericanAir that's 16+ extra hours of travel ...,2,9178,1783,.,2631
neutral,3099,3067,@SouthwestAir sent,5,3099,795,?,789
positive,2363,2298,@JetBlue thanks!,5,2363,607,!,717


In [14]:
df = df.drop_duplicates(keep = 'first')

In [15]:
df.shape

(14452, 3)

In [16]:
df.groupby('airline_sentiment').describe()

Unnamed: 0_level_0,text,text,text,text,text_cleaned,text_cleaned,text_cleaned,text_cleaned
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
airline_sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
negative,9087,9087,@VirginAmerica it's really aggressive to blast...,1,9087,1783,.,2602
neutral,3067,3067,@VirginAmerica What @dhepburn said.,1,3067,795,?,785
positive,2298,2298,@VirginAmerica plus you've added commercials t...,1,2298,607,!,683


# feature Extraction

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
tf_vec = TfidfVectorizer(max_features = 3000)

In [19]:
X = tf_vec.fit_transform(df['text_cleaned']).toarray()

In [20]:
X.shape

(14452, 2475)

In [21]:
Y = df['airline_sentiment'].values

In [22]:
Y.shape

(14452,)

# Train model

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
X_train ,X_test , Y_train ,  Y_test = train_test_split(X , Y , test_size = 0.2  , random_state = 2)

In [25]:
X_train.shape

(11561, 2475)

In [26]:
from sklearn.naive_bayes import MultinomialNB

In [27]:
model = MultinomialNB()
model.fit(X_train,Y_train)

MultinomialNB()

In [28]:
from sklearn.metrics import accuracy_score

In [29]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train,Y_train)
Y_pred = model.predict(X_test)
print(accuracy_score(Y_test,Y_pred))

0.6399169837426496


In [30]:
from sklearn.ensemble import RandomForestClassifier
model1 = RandomForestClassifier()
model1.fit(X_train,Y_train)
Y_pred = model1.predict(X_test)
print(accuracy_score(Y_test,Y_pred))


0.6395710826703562
