# Airline Sentiment Notebook
*Tuyet Nguyen*


In [1]:
import luigi
import pandas as pd
import numpy as np
import ast

from scipy import sparse
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from IPython.display import display, HTML

In [2]:
tweets_df = pd.read_csv('airline_tweets.csv', encoding = "ISO-8859-1")
cities_df = pd.read_csv('cities.csv', encoding = "ISO-8859-1")

In [3]:
display(tweets_df.head())
display(tweets_df.info())

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,airline_sentiment,airline_sentiment:confidence,negativereason,negativereason:confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,681448150,False,finalized,3,2/25/15 5:24,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2/24/15 11:35,5.70306e+17,,Eastern Time (US & Canada)
1,681448153,False,finalized,3,2/25/15 1:53,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2/24/15 11:15,5.70301e+17,,Pacific Time (US & Canada)
2,681448156,False,finalized,3,2/25/15 10:01,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2/24/15 11:15,5.70301e+17,Lets Play,Central Time (US & Canada)
3,681448158,False,finalized,3,2/25/15 3:05,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2/24/15 11:15,5.70301e+17,,Pacific Time (US & Canada)
4,681448159,False,finalized,3,2/25/15 5:50,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2/24/15 11:14,5.70301e+17,,Pacific Time (US & Canada)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 20 columns):
_unit_id                        14640 non-null int64
_golden                         14640 non-null bool
_unit_state                     14640 non-null object
_trusted_judgments              14640 non-null int64
_last_judgment_at               14584 non-null object
airline_sentiment               14640 non-null object
airline_sentiment:confidence    14640 non-null float64
negativereason                  9178 non-null object
negativereason:confidence       10522 non-null float64
airline                         14640 non-null object
airline_sentiment_gold          40 non-null object
name                            14640 non-null object
negativereason_gold             32 non-null object
retweet_count                   14640 non-null int64
text                            14640 non-null object
tweet_coord                     1019 non-null object
tweet_created                   14640 

None

In [4]:
tweets_clean_df = tweets_df[['airline_sentiment', 'tweet_coord']].dropna()

In [5]:
tweets_clean_df = tweets_clean_df.loc[tweets_clean_df['tweet_coord'] != '[0.0, 0.0]']

In [6]:
display(cities_df.head())
display(cities_df.info())

Unnamed: 0,geonameid,name,asciiname,alternatenames,latitude,longitude,feature class,feature code,country code,cc2,admin1 code,admin2 code,admin3 code,admin4 code,population,elevation,dem,timezone,modification date
0,3040051,les Escaldes,les Escaldes,"Ehskal'des-Ehndzhordani,Escaldes,Escaldes-Engo...",42.50729,1.53414,P,PPLA,AD,,8,,,,15853,,1033,Europe/Andorra,2008-10-15
1,3041563,Andorra la Vella,Andorra la Vella,"ALV,Ando-la-Vyey,Andora,Andora la Vela,Andora ...",42.50779,1.52109,P,PPLC,AD,,7,,,,20430,,1037,Europe/Andorra,2010-05-30
2,290594,Umm al Qaywayn,Umm al Qaywayn,"Oumm al Qaiwain,Oumm al QaÃ¯waÃ¯n,Um al Kawain...",25.56473,55.55517,P,PPLA,AE,,7,,,,44411,,2,Asia/Dubai,2014-10-07
3,291074,Ras al-Khaimah,Ras al-Khaimah,"Julfa,Khaimah,RKT,Ra's al Khaymah,Ra's al-Chai...",25.78953,55.9432,P,PPLA,AE,,5,,,,115949,,2,Asia/Dubai,2015-12-05
4,291696,Khawr FakkÄn,Khawr Fakkan,"Fakkan,FakkÄn,Khawr Fakkan,Khawr FakkÄn,Khaw...",25.33132,56.34199,P,PPL,AE,,6,,,,33575,,20,Asia/Dubai,2013-10-25


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23278 entries, 0 to 23277
Data columns (total 19 columns):
geonameid            23278 non-null int64
name                 23278 non-null object
asciiname            23278 non-null object
alternatenames       21281 non-null object
latitude             23278 non-null float64
longitude            23278 non-null float64
feature class        23278 non-null object
feature code         23278 non-null object
country code         23265 non-null object
cc2                  242 non-null object
admin1 code          23271 non-null object
admin2 code          16067 non-null object
admin3 code          7026 non-null object
admin4 code          2213 non-null object
population           23278 non-null int64
elevation            3686 non-null float64
dem                  23278 non-null int64
timezone             23278 non-null object
modification date    23278 non-null object
dtypes: float64(3), int64(3), object(13)
memory usage: 3.4+ MB


None

In [7]:
tweets_clean_df.head(1)

Unnamed: 0,airline_sentiment,tweet_coord
21,positive,"[40.74804263, -73.99295302]"


In [8]:
labels = {'negative': 0, 'neutral': 1, 'positive': 2}
tweets_clean_df.replace({'airline_sentiment': labels}, inplace=True)

In [9]:
def closest_node(node, nodes):
    nodes = np.asarray(nodes)
    dist_2 = np.sum((nodes - node)**2, axis=1)
    return np.argmin(dist_2)

In [10]:
cities_df[['latitude', 'longitude']].head()

closest_node([30., 30.], cities_df[['latitude', 'longitude']])

5911

In [11]:
cities_df[['latitude', 'longitude']].iloc[5911]

latitude     30.91018
longitude    30.17438
Name: 5911, dtype: float64

In [12]:
tweets_clean_df['tweet_coord'].map(ast.literal_eval).head()

21     [40.74804263, -73.99295302]
28       [42.361016, -71.02000488]
29     [33.94540417, -118.4062472]
32    [33.94209449, -118.40410103]
34       [33.2145038, -96.9321504]
Name: tweet_coord, dtype: object

In [13]:
tweets_clean_df['tweet_coord'].map(ast.literal_eval).map(lambda coords: closest_node(coords, cities_df[['latitude', 'longitude']])).map(lambda ind: cities_df[['latitude', 'longitude', 'name']].values[ind]).head()

21    [40.71427, -74.00596999999999, New York City]
28                   [42.39176, -71.03283, Chelsea]
29      [33.91918, -118.41646999999999, El Segundo]
32      [33.91918, -118.41646999999999, El Segundo]
34                    [33.15067, -96.82361, Frisco]
Name: tweet_coord, dtype: object

In [14]:
tweets_clean_df['closest_city'] = tweets_clean_df['tweet_coord'].map(ast.literal_eval).map(lambda coords: closest_node(coords, cities_df[['latitude', 'longitude']])).map(lambda ind: cities_df['name'][ind])

In [15]:
tweets_clean_df.head()

Unnamed: 0,airline_sentiment,tweet_coord,closest_city
21,2,"[40.74804263, -73.99295302]",New York City
28,0,"[42.361016, -71.02000488]",Chelsea
29,1,"[33.94540417, -118.4062472]",El Segundo
32,0,"[33.94209449, -118.40410103]",El Segundo
34,2,"[33.2145038, -96.9321504]",Frisco


In [16]:
cities_df['name'].head()

0        les Escaldes
1    Andorra la Vella
2      Umm al Qaywayn
3      Ras al-Khaimah
4       Khawr FakkÄn
Name: name, dtype: object

In [17]:
le = LabelEncoder()
cities_df['label'] = le.fit_transform(cities_df['name'])

ohe = OneHotEncoder(sparse=False)#.fit(cities_df['label'].values.reshape(-1, 1))

tweets_clean_df['closest_city_OHE'] = ohe.fit_transform(le.transform(tweets_clean_df['closest_city']).reshape(-1, 1)).tolist()

In [18]:
#tweets_clean_df.rename(columns = {'closest_city_OHE': 'X', 'airline_sentiment': 'y'})[['X', 'y']].to_csv('features.csv')


### Reading Back in the Data

In [19]:
%%time
features_df = pd.read_csv('features.csv', index_col=0)
features_df['X'] = features_df['X'].map(ast.literal_eval)

CPU times: user 764 ms, sys: 8 ms, total: 772 ms
Wall time: 838 ms


In [20]:
features_df.head()

Unnamed: 0,X,y
0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2
1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2


### Training Model

Use Random Forest due to the high large-dimension of OHE input.

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
# Convert X column to a 2d-array

X_train, X_test, y_train, y_test = train_test_split(sparse.csr_matrix(features_df['X'].tolist()), features_df['y'], test_size=0.3, random_state=42)

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report, accuracy_score

### Baseline Model

0 is the most common sentiment.

In [24]:
print('Baseline Train Score (0 Prediction): {:0.4f}'.format(accuracy_score(y_train, np.zeros(X_train.shape[0]))))
print('Baseline Test Score (0 Prediction): {:0.4f}'.format(accuracy_score(y_test, np.zeros(X_test.shape[0]))))
print('F1 Score (Macro Avg): %0.4f' % f1_score(y_test, np.zeros(X_test.shape[0]), average='macro'))
print("\n---- Baseline Classification Report ---\n")
print(classification_report(y_test,  np.zeros(X_test.shape[0])))

Baseline Train Score (0 Prediction): 0.6689
Baseline Test Score (0 Prediction): 0.6732
F1 Score (Macro Avg): 0.2682

---- Baseline Classification Report ---

             precision    recall  f1-score   support

          0       0.67      1.00      0.80       173
          1       0.00      0.00      0.00        38
          2       0.00      0.00      0.00        46

avg / total       0.45      0.67      0.54       257



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [25]:
# I use class weight to balance the imbalanced classes

rfc = RandomForestClassifier(n_estimators=100, max_depth=None, class_weight='balanced')

rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [26]:
print('Train score: %0.4f' % rfc.score(X_train, y_train))
print('Test score: %0.4f' % rfc.score(X_test, y_test))
print('F1 Score (Macro Avg): %0.4f' % f1_score(y_test, rfc.predict(X_test), average='macro'))
print("\n---- Classification Report ---\n")
print(classification_report(y_test, rfc.predict(X_test)))

Train score: 0.7341
Test score: 0.5292
F1 Score (Macro Avg): 0.3648

---- Classification Report ---

             precision    recall  f1-score   support

          0       0.68      0.69      0.68       173
          1       0.21      0.24      0.22        38
          2       0.21      0.17      0.19        46

avg / total       0.53      0.53      0.53       257



A little bit of grid search to optimize hyperparameters.

In [27]:
params = {'n_estimators':  [10, 50, 100],
                    'max_depth': [None, 5, 10],
                    'max_features': ['sqrt', 'auto'],
                    'class_weight': ['balanced', 'balanced_subsample']
                     }

In [28]:
%%time
#Each model trains pretty fast, don't specify n_jobs at model level, specify at grid-search level
rfc_GS = GridSearchCV(RandomForestClassifier(), params, verbose=1, n_jobs=4)
rfc_GS.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
CPU times: user 276 ms, sys: 116 ms, total: 392 ms
Wall time: 8.5 s


[Parallel(n_jobs=4)]: Done 108 out of 108 | elapsed:    8.2s finished


In [29]:
hyper_df = pd.DataFrame(rfc_GS.cv_results_)
display(hyper_df.sort_values('rank_test_score').head())

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_class_weight,param_max_depth,param_max_features,param_n_estimators,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
24,0.040526,0.002996,0.618729,0.678093,balanced_subsample,5,sqrt,10,"{'n_estimators': 10, 'max_depth': 5, 'max_feat...",1,0.626866,0.675063,0.613065,0.686717,0.616162,0.6725,0.004918,5.6e-05,0.005925,0.006187
6,0.044508,0.004986,0.618729,0.690627,balanced,5,sqrt,10,"{'n_estimators': 10, 'max_depth': 5, 'max_feat...",1,0.597015,0.687657,0.643216,0.689223,0.616162,0.695,0.009353,0.002626,0.018979,0.003158
27,0.035574,0.003163,0.61204,0.681454,balanced_subsample,5,auto,10,"{'n_estimators': 10, 'max_depth': 5, 'max_feat...",3,0.626866,0.697733,0.61809,0.651629,0.590909,0.695,0.001289,6.6e-05,0.015294,0.021119
9,0.033781,0.003328,0.610368,0.699882,balanced,5,auto,10,"{'n_estimators': 10, 'max_depth': 5, 'max_feat...",4,0.59204,0.722922,0.643216,0.689223,0.59596,0.6875,0.003468,0.000464,0.023253,0.016307
14,0.319713,0.029432,0.595318,0.806015,balanced,10,sqrt,100,"{'n_estimators': 100, 'max_depth': 10, 'max_fe...",5,0.567164,0.803526,0.60804,0.807018,0.611111,0.8075,0.012274,0.004117,0.020072,0.00177


In [30]:
print('Train score: %0.4f' % rfc_GS.best_estimator_.score(X_train, y_train))
print('Test score: %0.4f' % rfc_GS.best_estimator_.score(X_test, y_test))
print('F1 Score (Macro Avg): %0.4f' % f1_score(y_test, rfc_GS.best_estimator_.predict(X_test), average='macro'))
print("\n---- Classification Report ---\n")
print(classification_report(y_test, rfc_GS.best_estimator_.predict(X_test)))

Train score: 0.3679
Test score: 0.3152
F1 Score (Macro Avg): 0.2539

---- Classification Report ---

             precision    recall  f1-score   support

          0       0.76      0.24      0.36       173
          1       0.15      0.05      0.08        38
          2       0.20      0.83      0.32        46

avg / total       0.57      0.32      0.31       257



In [31]:
print(y_test.values)
print(rfc_GS.best_estimator_.predict(X_test))

[0 2 0 0 0 1 2 0 2 1 0 1 2 2 2 0 1 0 1 2 0 2 0 0 0 2 0 2 1 2 0 2 2 0 0 2 2
 2 0 0 2 1 0 0 0 0 0 0 2 0 0 1 0 0 0 0 0 0 2 0 2 0 0 1 0 0 1 1 0 1 1 1 1 2
 0 0 1 0 0 0 1 0 0 0 0 0 0 0 2 0 0 0 0 1 0 0 0 1 0 0 1 0 1 2 0 1 0 0 0 0 0
 0 0 0 0 2 0 2 1 0 0 2 0 0 0 2 0 1 0 2 0 0 0 2 2 2 0 0 0 0 0 0 2 0 2 1 2 0
 0 0 2 0 0 0 0 0 0 1 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 2 2
 0 1 1 0 1 0 0 0 0 0 0 2 2 0 0 0 0 0 0 1 0 0 2 2 0 0 2 2 0 0 0 0 1 2 0 0 0
 0 0 1 0 0 0 0 0 0 0 2 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[2 2 2 2 2 2 2 2 2 2 1 2 2 2 0 0 2 0 2 0 2 2 2 0 2 2 2 2 2 2 0 2 2 0 0 2 0
 2 0 2 2 2 1 2 0 0 2 0 2 2 0 2 2 2 2 1 2 2 2 2 2 2 2 0 2 0 1 2 0 0 2 2 2 0
 2 2 2 2 2 2 2 2 0 0 2 2 2 2 2 2 2 0 2 2 0 0 2 2 2 0 0 2 2 2 2 2 2 2 0 2 2
 2 2 2 2 2 2 0 2 2 1 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 1 2 2 2 2 2 0 2 2 0 2
 2 2 2 2 2 2 2 2 2 2 0 2 2 2 0 2 2 2 2 2 0 2 0 2 0 2 0 2 1 2 2 2 0 2 2 2 2
 2 0 2 2 1 2 2 2 2 2 2 2 0 2 2 0 0 0 2 0 0 2 2 1 0 2 2 2 0 2 2 2 2 2 1 0 2
 0 2 2 0 2 2 0 2 0 2 2 1 2 2

In [32]:
#Save the classifier

import pickle
with open('best_rfc.pkl', 'wb') as fid:
    pickle.dump(rfc_GS.best_estimator_, fid)

### Scoring Model

In [33]:
#Load the classifier and check if it works

loaded_model = pickle.load(open('best_rfc.pkl', 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.315175097276


In [34]:
loaded_model.predict(sparse.csr_matrix(features_df['X'].tolist()))[0:50]

array([2, 1, 2, 2, 2, 2, 2, 0, 2, 1, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 1, 1, 2,
       2, 2, 2, 1, 2, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
       2, 2, 2, 2])

In [35]:
probs = loaded_model.predict_proba(sparse.csr_matrix(features_df['X'].tolist()))
probs

array([[ 0.32692719,  0.33488713,  0.33818568],
       [ 0.3013956 ,  0.36788368,  0.33072072],
       [ 0.32692719,  0.33488713,  0.33818568],
       ..., 
       [ 0.32692719,  0.33488713,  0.33818568],
       [ 0.32692719,  0.33488713,  0.33818568],
       [ 0.34333933,  0.33903159,  0.31762909]])

In [36]:
scores_df = pd.DataFrame(tweets_clean_df['closest_city']).rename(columns = {'closest_city': 'city name'})
scores_df['negative probability'] = probs[:, 0]
scores_df['neutral probability'] = probs[:, 1] 
scores_df['positive probability'] = probs[:, 2] 
scores_df

Unnamed: 0,city name,negative probability,neutral probability,positive probability
21,New York City,0.326927,0.334887,0.338186
28,Chelsea,0.301396,0.367884,0.330721
29,El Segundo,0.326927,0.334887,0.338186
32,El Segundo,0.326927,0.334887,0.338186
34,Frisco,0.263840,0.274632,0.461529
42,Culver City,0.326927,0.334887,0.338186
62,Aliso Viejo,0.326927,0.334887,0.338186
69,Springfield Gardens,0.343339,0.339032,0.317629
74,Paradise,0.326927,0.334887,0.338186
108,San Francisco,0.300684,0.379869,0.319448
