<a href="https://colab.research.google.com/github/RishiNandhan/US-airlines-tweets-sentiment-analysis-prediction/blob/master/US_airlines_Tweets_sentiment_analysis_%26_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#import all the necessary packages
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from textblob import TextBlob
from textblob import Word
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [0]:
#set the display option to display all rows and columns
pd.set_option("display.max_rows",None)
pd.set_option("display.max_columns",None)


In [3]:
#Load the data
data=pd.read_csv("Tweets.csv")
print(data.info())  #check for the info
print(data.shape)   #get the dimensions of the data


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
tweet_id                        14640 non-null int64
airline_sentiment               14640 non-null object
airline_sentiment_confidence    14640 non-null float64
negativereason                  9178 non-null object
negativereason_confidence       10522 non-null float64
airline                         14640 non-null object
airline_sentiment_gold          40 non-null object
name                            14640 non-null object
negativereason_gold             32 non-null object
retweet_count                   14640 non-null int64
text                            14640 non-null object
tweet_coord                     1019 non-null object
tweet_created                   14640 non-null object
tweet_location                  9907 non-null object
user_timezone                   9820 non-null object
dtypes: float64(2), int64(2), object(11)
memory usage: 1.7+ MB
None
(14640, 15)


In [4]:
#get the unique airline counts
airline_count=data['airline'].value_counts().sort_values(ascending=False)
print(airline_count)


United            3822
US Airways        2913
American          2759
Southwest         2420
Delta             2222
Virgin America     504
Name: airline, dtype: int64


In [5]:
#percentage of tweets in each airline
tweet_percentage=(data[['airline','text']].groupby('airline').count()/len(data)).sort_values(by='text',ascending=False)
print(tweet_percentage)


                    text
airline                 
United          0.261066
US Airways      0.198975
American        0.188456
Southwest       0.165301
Delta           0.151776
Virgin America  0.034426


In [6]:
#tweets count for each airline and airline_sentiment
print(pd.crosstab(data.airline,data.airline_sentiment))


airline_sentiment  negative  neutral  positive
airline                                       
American               1960      463       336
Delta                   955      723       544
Southwest              1186      664       570
US Airways             2263      381       269
United                 2633      697       492
Virgin America          181      171       152


In [7]:
#create a seperate df for tweets
tweets=data['text'].values
tweets_df=pd.DataFrame(data={'tweets':tweets},index=np.arange(len(tweets)))
print(tweets_df.head())


                                              tweets
0                @VirginAmerica What @dhepburn said.
1  @VirginAmerica plus you've added commercials t...
2  @VirginAmerica I didn't today... Must mean I n...
3  @VirginAmerica it's really aggressive to blast...
4  @VirginAmerica and it's a really big bad thing...


In [8]:
#get the no of words in each tweet
tweets_df['no_of_words']=tweets_df['tweets'].apply(lambda x: len(str(x).split()))
print(tweets_df.head())

                                              tweets  no_of_words
0                @VirginAmerica What @dhepburn said.            4
1  @VirginAmerica plus you've added commercials t...            9
2  @VirginAmerica I didn't today... Must mean I n...           12
3  @VirginAmerica it's really aggressive to blast...           17
4  @VirginAmerica and it's a really big bad thing...           10


In [9]:
#get the no of characters in each tweet
tweets_df['no_of_chars']=tweets_df['tweets'].apply(lambda x: len(str(x)))
print(tweets_df.head())


                                              tweets  no_of_words  no_of_chars
0                @VirginAmerica What @dhepburn said.            4           35
1  @VirginAmerica plus you've added commercials t...            9           72
2  @VirginAmerica I didn't today... Must mean I n...           12           71
3  @VirginAmerica it's really aggressive to blast...           17          126
4  @VirginAmerica and it's a really big bad thing...           10           55


In [10]:
#get the avg word length in each tweet
tweets_df['avg_word_length']=tweets_df['no_of_chars']/tweets_df['no_of_words']
print(tweets_df.head())



                                              tweets  no_of_words  \
0                @VirginAmerica What @dhepburn said.            4   
1  @VirginAmerica plus you've added commercials t...            9   
2  @VirginAmerica I didn't today... Must mean I n...           12   
3  @VirginAmerica it's really aggressive to blast...           17   
4  @VirginAmerica and it's a really big bad thing...           10   

   no_of_chars  avg_word_length  
0           35         8.750000  
1           72         8.000000  
2           71         5.916667  
3          126         7.411765  
4           55         5.500000  


In [11]:
#get the stop words from NLTK package
stop=stopwords.words('english')
print(len(stop))
print(stop)


179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than

In [12]:
#get the no of stop words in each tweet
tweets_df['no_of_stopwords']=tweets_df['tweets'].apply(lambda x: len([x for x in str(x).split() if x in stop]) )
print(tweets_df.head())


                                              tweets  no_of_words  \
0                @VirginAmerica What @dhepburn said.            4   
1  @VirginAmerica plus you've added commercials t...            9   
2  @VirginAmerica I didn't today... Must mean I n...           12   
3  @VirginAmerica it's really aggressive to blast...           17   
4  @VirginAmerica and it's a really big bad thing...           10   

   no_of_chars  avg_word_length  no_of_stopwords  
0           35         8.750000                0  
1           72         8.000000                3  
2           71         5.916667                2  
3          126         7.411765                6  
4           55         5.500000                5  


In [14]:
#get the no of numeric in each tweets
tweets_df['no_of_numeric']=tweets_df['tweets'].apply(lambda x: len([x for x in str(x).split() if x.isnumeric()]))
print(tweets_df.head())


                                              tweets  no_of_words  \
0                @VirginAmerica What @dhepburn said.            4   
1  @VirginAmerica plus you've added commercials t...            9   
2  @VirginAmerica I didn't today... Must mean I n...           12   
3  @VirginAmerica it's really aggressive to blast...           17   
4  @VirginAmerica and it's a really big bad thing...           10   

   no_of_chars  avg_word_length  no_of_stopwords  no_of_numeric  
0           35         8.750000                0              0  
1           72         8.000000                3              0  
2           71         5.916667                2              0  
3          126         7.411765                6              0  
4           55         5.500000                5              0  


In [15]:
#get the no of hashtags in each tweet
tweets_df['no_of_hashtag']=tweets_df['tweets'].apply(lambda x: len([x for x in str(x).split() if x.startswith("#")]))
print(tweets_df.head())



                                              tweets  no_of_words  \
0                @VirginAmerica What @dhepburn said.            4   
1  @VirginAmerica plus you've added commercials t...            9   
2  @VirginAmerica I didn't today... Must mean I n...           12   
3  @VirginAmerica it's really aggressive to blast...           17   
4  @VirginAmerica and it's a really big bad thing...           10   

   no_of_chars  avg_word_length  no_of_stopwords  no_of_numeric  no_of_hashtag  
0           35         8.750000                0              0              0  
1           72         8.000000                3              0              0  
2           71         5.916667                2              0              0  
3          126         7.411765                6              0              0  
4           55         5.500000                5              0              0  


In [16]:
#remove stop words
tweets_df['tweets_cleaned']=tweets_df['tweets'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
print(tweets_df.head())


                                              tweets  no_of_words  \
0                @VirginAmerica What @dhepburn said.            4   
1  @VirginAmerica plus you've added commercials t...            9   
2  @VirginAmerica I didn't today... Must mean I n...           12   
3  @VirginAmerica it's really aggressive to blast...           17   
4  @VirginAmerica and it's a really big bad thing...           10   

   no_of_chars  avg_word_length  no_of_stopwords  no_of_numeric  \
0           35         8.750000                0              0   
1           72         8.000000                3              0   
2           71         5.916667                2              0   
3          126         7.411765                6              0   
4           55         5.500000                5              0   

   no_of_hashtag                                     tweets_cleaned  
0              0                @VirginAmerica What @dhepburn said.  
1              0  @VirginAmerica plus adde

In [17]:
#change the tweets to lowercase
tweets_df['tweets_cleaned']=tweets_df['tweets_cleaned'].apply(lambda x: " ".join(x.lower() for x in x.split()))
print(tweets_df.head())



                                              tweets  no_of_words  \
0                @VirginAmerica What @dhepburn said.            4   
1  @VirginAmerica plus you've added commercials t...            9   
2  @VirginAmerica I didn't today... Must mean I n...           12   
3  @VirginAmerica it's really aggressive to blast...           17   
4  @VirginAmerica and it's a really big bad thing...           10   

   no_of_chars  avg_word_length  no_of_stopwords  no_of_numeric  \
0           35         8.750000                0              0   
1           72         8.000000                3              0   
2           71         5.916667                2              0   
3          126         7.411765                6              0   
4           55         5.500000                5              0   

   no_of_hashtag                                     tweets_cleaned  
0              0                @virginamerica what @dhepburn said.  
1              0  @virginamerica plus adde

In [18]:
#remove puntuations
tweets_df['tweets_cleaned']=tweets_df['tweets_cleaned'].apply(lambda x: x.translate(str.maketrans("","",string.punctuation)))
print(tweets_df.head())




                                              tweets  no_of_words  \
0                @VirginAmerica What @dhepburn said.            4   
1  @VirginAmerica plus you've added commercials t...            9   
2  @VirginAmerica I didn't today... Must mean I n...           12   
3  @VirginAmerica it's really aggressive to blast...           17   
4  @VirginAmerica and it's a really big bad thing...           10   

   no_of_chars  avg_word_length  no_of_stopwords  no_of_numeric  \
0           35         8.750000                0              0   
1           72         8.000000                3              0   
2           71         5.916667                2              0   
3          126         7.411765                6              0   
4           55         5.500000                5              0   

   no_of_hashtag                                     tweets_cleaned  
0              0                   virginamerica what dhepburn said  
1              0  virginamerica plus added

In [19]:
# count each words in the dataset
count_of_each_word=pd.Series(" ".join(tweets_df['tweets_cleaned']).split()).value_counts()
print(count_of_each_word[:20])
#remove top commonly used words
remove_common_words=count_of_each_word[:7]
tweets_df['tweets_cleaned']=tweets_df['tweets_cleaned'].apply(lambda x: " ".join(x for x in x.split() if x not in remove_common_words))



i               5075
united          4117
flight          3870
usairways       3039
americanair     2938
southwestair    2441
jetblue         2256
get             1334
thanks          1071
cancelled       1056
service          953
you              873
help             852
time             769
customer         746
im               740
us               678
hours            669
flights          645
2                644
dtype: int64


In [20]:
#remove rarely used words
print(count_of_each_word[-20:])
remove_rare_words=count_of_each_word[-20:]
tweets_df['tweets_cleaned']=tweets_df['tweets_cleaned'].apply(lambda x: " ".join(x for x in x.split() if x not in remove_rare_words))



lotbut               1
s4                   1
psgrs                1
n615jb               1
cheatcustomers       1
viraltech            1
dimensions           1
enforcement          1
nycbuenos            1
seanmfmadden         1
bad👎👎                1
herself              1
httptco5txu5tsfkj    1
ils                  1
angriest             1
princesses           1
powered              1
compact              1
diet                 1
23oct                1
dtype: int64


In [21]:
#Lemantize each word
tweets_df['tweets_cleaned']=tweets_df['tweets_cleaned'].apply(lambda x: " ".join([Word(x).lemmatize() for x in x.split()]))
print(tweets_df.head())



                                              tweets  no_of_words  \
0                @VirginAmerica What @dhepburn said.            4   
1  @VirginAmerica plus you've added commercials t...            9   
2  @VirginAmerica I didn't today... Must mean I n...           12   
3  @VirginAmerica it's really aggressive to blast...           17   
4  @VirginAmerica and it's a really big bad thing...           10   

   no_of_chars  avg_word_length  no_of_stopwords  no_of_numeric  \
0           35         8.750000                0              0   
1           72         8.000000                3              0   
2           71         5.916667                2              0   
3          126         7.411765                6              0   
4           55         5.500000                5              0   

   no_of_hashtag                                     tweets_cleaned  
0              0                   virginamerica what dhepburn said  
1              0  virginamerica plus added

Spell check requires lot of computation power and it is very time consuming. 

In [0]:
#spell check
tweets_df['tweets_cleaned']=tweets_df['tweets_cleaned'].apply(lambda x: str(TextBlob(x).correct()))
print(tweets_df.head())



In [0]:
#change to Tf-Idf vectorization
vectorizer=TfidfVectorizer(max_features=3000,min_df=7,max_df=0.8)
processed_features=vectorizer.fit_transform(tweets_df['tweets_cleaned']).toarray()


In [0]:
#split the data into train and test
x_train, x_test, y_train, y_test=train_test_split(processed_features,data['airline_sentiment'],test_size=0.2,random_state=21)





In [24]:
#Random Forest Classifier
text_classifier = RandomForestClassifier(n_estimators=200, random_state=21)
text_classifier.fit(x_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=False, random_state=21, verbose=0,
                       warm_start=False)

In [0]:
#make prediction using Random Forest
predictions=text_classifier.predict(x_test)



In [28]:
#evaluate using metrics
print("\n CONFUSION MATRIX \n",confusion_matrix(y_test,predictions))
print("\n CLASSIFICATION REPORT\n",classification_report(y_test,predictions))
print("\n ACCURACY SCORE:",accuracy_score(y_test, predictions))




 CONFUSION MATRIX 
 [[1682  101   47]
 [ 270  301   61]
 [ 140   64  262]]

 CLASSIFICATION REPORT
               precision    recall  f1-score   support

    negative       0.80      0.92      0.86      1830
     neutral       0.65      0.48      0.55       632
    positive       0.71      0.56      0.63       466

    accuracy                           0.77      2928
   macro avg       0.72      0.65      0.68      2928
weighted avg       0.75      0.77      0.75      2928


 ACCURACY SCORE: 0.7667349726775956


In [29]:
#Gaussian NaiveBayes Classifier
NB=GaussianNB()
NB.fit(x_train,y_train)


GaussianNB(priors=None, var_smoothing=1e-09)

In [0]:
#make prediction using Gaussian NaiveBayes Classifier
nb_prediction=NB.predict(x_test)



In [31]:
print("\n CONFUSION MATRIX \n",confusion_matrix(y_test,nb_prediction))
print("\n CLASSIFICATION REPORT\n",classification_report(y_test,nb_prediction))
print("\n ACCURACY SCORE:",accuracy_score(y_test, nb_prediction))



 CONFUSION MATRIX 
 [[635 467 728]
 [ 50 198 384]
 [ 37  67 362]]

 CLASSIFICATION REPORT
               precision    recall  f1-score   support

    negative       0.88      0.35      0.50      1830
     neutral       0.27      0.31      0.29       632
    positive       0.25      0.78      0.37       466

    accuracy                           0.41      2928
   macro avg       0.47      0.48      0.39      2928
weighted avg       0.65      0.41      0.43      2928


 ACCURACY SCORE: 0.40812841530054644
