# Read the dataset

In [None]:
import pandas as pd



In [None]:
# Load the tweets from 'Tweets.csv', resolved relative to the working directory.
df = pd.read_csv('Tweets.csv')

In [None]:
# Only a cell's last expression is rendered automatically, so the preview is
# displayed explicitly; otherwise the head() result is silently discarded.
display(df.head(5))

df.shape

(14640, 15)

In [None]:
# Keep only the label and the raw tweet text. The explicit .copy() makes the
# result an independent frame, so the column added to it later is an
# unambiguous write rather than a possible write to a slice of the original
# (which pandas may flag with SettingWithCopyWarning).
df = df[['airline_sentiment','text']].copy()

In [None]:
# Seeded so the displayed rows are the same on every Restart & Run All.
df.sample(5, random_state=2)

Unnamed: 0,airline_sentiment,text
584,negative,"@united as a 1k, I'm always hoping for improve..."
1036,positive,@united flight 1491...plane from SFO to DEN is...
1238,negative,@united working with Lisa J at ORD. she's work...
2503,positive,@united Very impressed so far. An app that's w...
3581,negative,"@united lost my parents luggage to cancun, sai..."


# Preprocessing


In [None]:
import nltk
import re
import string

from nltk.stem.porter import PorterStemmer

# Stop-word corpus used by clean_text to filter tokens.
nltk.download('stopwords')
from nltk.corpus import stopwords

# Tokenizer data for nltk.word_tokenize. Recent NLTK releases load the
# 'punkt_tab' resource instead of the older 'punkt' one, so both are fetched
# to keep the notebook runnable on a fresh environment.
nltk.download('punkt')
nltk.download('punkt_tab')
ps = PorterStemmer()



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Cache for the English stop-word set; filled on first use by _english_stopwords.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
  """Return NLTK's English stop words as a frozenset, loading them only once."""
  global _ENGLISH_STOPWORDS
  if _ENGLISH_STOPWORDS is None:
    _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
  return _ENGLISH_STOPWORDS


def clean_text(text):
  """Normalise one tweet into a space-separated string of stemmed tokens.

  Steps, in order: lower-case the text, strip URLs, tokenize with
  ``nltk.word_tokenize``, drop English stop words, and Porter-stem what
  remains. Punctuation and symbol tokens (e.g. ``@``, ``.``) are not removed.

  Args:
    text: The raw tweet as a ``str``.

  Returns:
    The cleaned tokens joined by single spaces.
  """
  text = text.lower()
  # Remove anything that looks like http://... or https://... plus one
  # trailing whitespace character.
  text = re.sub(r'http.?://[^\s]+[\s]?', '', text)

  # Look the stop words up once per call, from a cached set, instead of
  # re-reading the corpus list for every single token.
  stop_words = _english_stopwords()
  tokens = nltk.word_tokenize(text)

  return " ".join(ps.stem(tok) for tok in tokens if tok not in stop_words)

In [None]:
# Run every tweet through clean_text and store the result in a new column.
df['text_cleaned'] = df['text'].map(clean_text)

In [None]:
# Preview the first rows to compare the raw text with the new text_cleaned column.
df.head(5)

Unnamed: 0,airline_sentiment,text,text_cleaned
0,neutral,@VirginAmerica What @dhepburn said.,@ virginamerica @ dhepburn said .
1,positive,@VirginAmerica plus you've added commercials t...,@ virginamerica plu 've ad commerci experi ......
2,neutral,@VirginAmerica I didn't today... Must mean I n...,@ virginamerica n't today ... must mean need t...
3,negative,@VirginAmerica it's really aggressive to blast...,@ virginamerica 's realli aggress blast obnoxi...
4,negative,@VirginAmerica and it's a really big bad thing...,@ virginamerica 's realli big bad thing


# Feature Extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF features over the cleaned tweets, with the vocabulary capped at 3000 terms.
# NOTE(review): the vectoriser is fitted on every row before the train/test
# split happens, so the test tweets influence the learned vocabulary and IDF
# weights. Consider splitting first and fitting on the training part only.
tf_vec = TfidfVectorizer(max_features=3000)
tf_vec.fit(df['text_cleaned'])
X = tf_vec.transform(df['text_cleaned']).toarray()

In [None]:
# Dimensions of the dense feature matrix: one row per tweet, one column per TF-IDF term.
X.shape

(14640, 3000)

In [None]:
# Target vector: the sentiment label of each tweet as a NumPy array.
Y = df['airline_sentiment'].to_numpy()

# Train the model

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, x_test, Y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# First model: multinomial Naive Bayes on the TF-IDF features.
model = MultinomialNB()
model.fit(X=X_train, y=Y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the Naive Bayes model on the held-out split.
y_pred = model.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7219945355191257

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Second model: random forest. Note that this rebinds `model`, replacing the
# Naive Bayes estimator fitted earlier.
# The forest is randomised (bootstrap samples, feature sub-sampling), so a
# fixed seed is needed for the reported accuracy to be reproducible; the same
# seed as the train/test split is used.
model = RandomForestClassifier(random_state=2)
model.fit(X_train,Y_train)

In [None]:
# Accuracy of the random forest on the held-out split.
y_predict = model.predict(x_test)
print(accuracy_score(y_true=y_test, y_pred=y_predict))

0.7496584699453552


In [None]:
# Per-sentiment summary (count, unique, top, freq) of the raw and cleaned text columns.
df.groupby('airline_sentiment').describe()

Unnamed: 0_level_0,text,text,text,text,text_cleaned,text_cleaned,text_cleaned,text_cleaned
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
airline_sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
negative,9178,9087,@AmericanAir that's 16+ extra hours of travel ...,2,9178,9083,@ americanair fyi ... call still get drop . ho...,2
neutral,3099,3067,@SouthwestAir sent,5,3099,3025,@ jetblu 's ceo battl appeas passeng wall stre...,8
positive,2363,2298,@JetBlue thanks!,5,2363,2262,@ jetblu thank !,12
