In [1]:
import luigi
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report
import pickle



In [2]:
tweets_df = pd.read_csv('airline_tweets.csv', encoding = "ISO-8859-1")
cities_df = pd.read_csv('cities.csv', encoding = "ISO-8859-1")

In [3]:
tweets_df.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,airline_sentiment,airline_sentiment:confidence,negativereason,negativereason:confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,681448150,False,finalized,3,2/25/15 5:24,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2/24/15 11:35,5.70306e+17,,Eastern Time (US & Canada)
1,681448153,False,finalized,3,2/25/15 1:53,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2/24/15 11:15,5.70301e+17,,Pacific Time (US & Canada)
2,681448156,False,finalized,3,2/25/15 10:01,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2/24/15 11:15,5.70301e+17,Lets Play,Central Time (US & Canada)
3,681448158,False,finalized,3,2/25/15 3:05,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2/24/15 11:15,5.70301e+17,,Pacific Time (US & Canada)
4,681448159,False,finalized,3,2/25/15 5:50,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2/24/15 11:14,5.70301e+17,,Pacific Time (US & Canada)


In [4]:
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 20 columns):
_unit_id                        14640 non-null int64
_golden                         14640 non-null bool
_unit_state                     14640 non-null object
_trusted_judgments              14640 non-null int64
_last_judgment_at               14584 non-null object
airline_sentiment               14640 non-null object
airline_sentiment:confidence    14640 non-null float64
negativereason                  9178 non-null object
negativereason:confidence       10522 non-null float64
airline                         14640 non-null object
airline_sentiment_gold          40 non-null object
name                            14640 non-null object
negativereason_gold             32 non-null object
retweet_count                   14640 non-null int64
text                            14640 non-null object
tweet_coord                     1019 non-null object
tweet_created                   14640 

In [5]:
cities_df.head()

Unnamed: 0,geonameid,name,asciiname,alternatenames,latitude,longitude,feature class,feature code,country code,cc2,admin1 code,admin2 code,admin3 code,admin4 code,population,elevation,dem,timezone,modification date
0,3040051,les Escaldes,les Escaldes,"Ehskal'des-Ehndzhordani,Escaldes,Escaldes-Engo...",42.50729,1.53414,P,PPLA,AD,,8,,,,15853,,1033,Europe/Andorra,2008-10-15
1,3041563,Andorra la Vella,Andorra la Vella,"ALV,Ando-la-Vyey,Andora,Andora la Vela,Andora ...",42.50779,1.52109,P,PPLC,AD,,7,,,,20430,,1037,Europe/Andorra,2010-05-30
2,290594,Umm al Qaywayn,Umm al Qaywayn,"Oumm al Qaiwain,Oumm al QaÃ¯waÃ¯n,Um al Kawain...",25.56473,55.55517,P,PPLA,AE,,7,,,,44411,,2,Asia/Dubai,2014-10-07
3,291074,Ras al-Khaimah,Ras al-Khaimah,"Julfa,Khaimah,RKT,Ra's al Khaymah,Ra's al-Chai...",25.78953,55.9432,P,PPLA,AE,,5,,,,115949,,2,Asia/Dubai,2015-12-05
4,291696,Khawr FakkÄn,Khawr Fakkan,"Fakkan,FakkÄn,Khawr Fakkan,Khawr FakkÄn,Khaw...",25.33132,56.34199,P,PPL,AE,,6,,,,33575,,20,Asia/Dubai,2013-10-25


In [6]:
cities_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23278 entries, 0 to 23277
Data columns (total 19 columns):
geonameid            23278 non-null int64
name                 23278 non-null object
asciiname            23278 non-null object
alternatenames       21281 non-null object
latitude             23278 non-null float64
longitude            23278 non-null float64
feature class        23278 non-null object
feature code         23278 non-null object
country code         23265 non-null object
cc2                  242 non-null object
admin1 code          23271 non-null object
admin2 code          16067 non-null object
admin3 code          7026 non-null object
admin4 code          2213 non-null object
population           23278 non-null int64
elevation            3686 non-null float64
dem                  23278 non-null int64
timezone             23278 non-null object
modification date    23278 non-null object
dtypes: float64(3), int64(3), object(13)
memory usage: 3.4+ MB


In [7]:
#drop missing values and values with no tweet coordinate
tweets_clean_df = tweets_df[['airline_sentiment', 'tweet_coord']].dropna()
tweets_clean_df = tweets_clean_df.loc[tweets_clean_df['tweet_coord'] != '[0.0, 0.0]']
tweets_clean_df.tweet_coord

21        [40.74804263, -73.99295302]
28          [42.361016, -71.02000488]
29        [33.94540417, -118.4062472]
32       [33.94209449, -118.40410103]
34          [33.2145038, -96.9321504]
42        [34.0219817, -118.38591198]
62       [33.57963333, -117.73024772]
69         [40.6413712, -73.78311558]
74       [36.08457854, -115.13780136]
108      [37.79374402, -122.39327564]
115          [26.074379, -80.1416831]
124       [33.9469039, -118.40716847]
126      [51.04345575, -114.06071363]
128       [40.64662464, -73.77090177]
147      [33.94652852, -118.40766257]
189        [38.9128188, -77.00798226]
205       [32.84359605, -96.84910929]
211        [32.8437698, -96.84928399]
249       [26.06726717, -80.14433663]
264         [32.8454782, -96.8504585]
285       [39.24087254, -94.63994975]
303      [37.77465018, -122.44032176]
337      [33.94696831, -118.40747994]
409       [40.63767372, -74.11075451]
473       [40.78986648, -73.10068286]
475        [42.5696777, -71.42056878]
499      [37

In [8]:
#preprocessing to convert tweet coord to float for distance calculation
ls = [row.strip('][').split(',') for row in tweets_clean_df.tweet_coord]
ls2 = [[float(x) for x in row] for row in ls ]

In [9]:
# function to calculate Euclidean distance between tweet geolocation and all other cities in cities.csv
def closest_city(coord,frame):
    dist = np.sqrt(((frame[['latitude','longitude']] - coord) ** 2).sum(1))
    return frame[['name']].iloc[(dist).idxmin()][0]

In [10]:
# Find nearest city to each tweet geolocation based on Euclidean distance
tweets_clean_df['nearest_city']=[closest_city(x,cities_df) for x in ls2]

In [11]:
tweets_clean_df.head()

Unnamed: 0,airline_sentiment,tweet_coord,nearest_city
21,positive,"[40.74804263, -73.99295302]",New York City
28,negative,"[42.361016, -71.02000488]",Chelsea
29,neutral,"[33.94540417, -118.4062472]",El Segundo
32,negative,"[33.94209449, -118.40410103]",El Segundo
34,positive,"[33.2145038, -96.9321504]",Frisco


In [12]:
#replace airline_sentiment with numerical labels for classification
labels = {'negative': 0, 'neutral': 1, 'positive': 2}
tweets_clean_df.replace({'airline_sentiment': labels}, inplace=True)

In [13]:
X = pd.get_dummies(tweets_clean_df.nearest_city)
y = tweets_clean_df.airline_sentiment

In [14]:
X.head()

Unnamed: 0,Addison,Aldine,Aliso Viejo,Allen,American Fork,Anaheim,Angeles City,Annapolis,Arbutus,Ashford,...,Whitestone,Williamsport,Willow Grove,Windsor,Winnipeg,Winter Park,Wolverhampton,Woodstock,Xiuying,Zionsville
21,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
features_df = pd.concat([X,y], axis=1)
features_df.head()

Unnamed: 0,Addison,Aldine,Aliso Viejo,Allen,American Fork,Anaheim,Angeles City,Annapolis,Arbutus,Ashford,...,Williamsport,Willow Grove,Windsor,Winnipeg,Winter Park,Wolverhampton,Woodstock,Xiuying,Zionsville,airline_sentiment
21,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
28,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
32,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [None]:
# save features to csv for next stage of pipeline
#features_df.to_csv('features.csv')

In [16]:
# load in features for machine learning analysis
#features_df = pd.read_csv('features.csv', index_col=0)
X = features_df.drop(['airline_sentiment'], axis=1)
y = features_df.airline_sentiment

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

print("Number samples X_train dataset: ", X_train.shape)
print("Number samples y_train dataset: ", y_train.shape)
print("Number samples X_test dataset: ", X_test.shape)
print("Number samples y_test dataset: ", y_test.shape)

Number samples X_train dataset:  (684, 291)
Number samples y_train dataset:  (684,)
Number samples X_test dataset:  (171, 291)
Number samples y_test dataset:  (171,)


In [18]:
# SMOTE oversampling to deal with class imbalance

print("Before OverSampling, counts of label '2': {}".format(sum(y_train==2)))
print("Before OverSampling, counts of label '1': {} ".format(sum(y_train==1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train==0)))

sm = SMOTE(random_state=2)
X_train_r, y_train_r = sm.fit_sample(X_train, y_train.ravel())

print('After OverSampling, the shape of train_X: {}'.format(X_train_r.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_r.shape))

print("After OverSampling, counts of label '2': {}".format(sum(y_train_r==2)))
print("After OverSampling, counts of label '1': {}".format(sum(y_train_r==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_r==0)))

Before OverSampling, counts of label '2': 125
Before OverSampling, counts of label '1': 102 
Before OverSampling, counts of label '0': 457 

After OverSampling, the shape of train_X: (1371, 291)
After OverSampling, the shape of train_y: (1371,) 

After OverSampling, counts of label '2': 457
After OverSampling, counts of label '1': 457
After OverSampling, counts of label '0': 457


In [19]:
# Definite class imbalance. But to avoid data leakage, SMOTE needs to be applied within each fold for CV

sm = SMOTE()
rf = RandomForestClassifier()

pipeline = Pipeline([('sm', sm), ('rf', rf)])

params = {'rf__max_depth' : list(range(2,5)),
          'rf__max_features' : ['auto','sqrt'],
          'rf__bootstrap' : [True, False],
          'rf__n_estimators' : [10, 20, 50, 100]
         }

grid = GridSearchCV(pipeline, params, cv = 5)

grid.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('sm',
                                        SMOTE(k_neighbors=5, kind='deprecated',
                                              m_neighbors='deprecated',
                                              n_jobs=1, out_step='deprecated',
                                              random_state=None, ratio=None,
                                              sampling_strategy='auto',
                                              svm_estimator='deprecated')),
                                       ('rf',
                                        RandomForestClassifier(bootstrap=True,
                                                               class_weight=None,
                                                               criterion='gini',...
                                                               n_estimators='warn',
                          

In [20]:
# Print the best parameters and highest score
print("Best parameters found: ", grid.best_params_)
print("Highest score found: ", grid.best_score_)
print('Train score: %0.4f' % grid.best_estimator_.score(X_train, y_train))
print('Test score: %0.4f' % grid.best_estimator_.score(X_test, y_test))
print('F1 Score (Macro Avg): %0.4f' % f1_score(y_test, grid.best_estimator_.predict(X_test), average='macro'))    

Best parameters found:  {'rf__bootstrap': True, 'rf__max_depth': 2, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
Highest score found:  0.6695906432748538
Train score: 0.6886
Test score: 0.6842
F1 Score (Macro Avg): 0.3176


  'precision', 'predicted', average, warn_for)


In [21]:
hyperparams_df = pd.DataFrame(grid.cv_results_)
display(hyperparams_df.sort_values('rank_test_score').head())

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rf__bootstrap,param_rf__max_depth,param_rf__max_features,param_rf__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
30,0.072384,0.002832,0.00629,0.001148,False,2,sqrt,50,"{'rf__bootstrap': False, 'rf__max_depth': 2, '...",0.666667,0.666667,0.683824,0.669118,0.661765,0.669591,0.00748,1
13,0.044666,0.003865,0.003581,0.000547,True,3,sqrt,20,"{'rf__bootstrap': True, 'rf__max_depth': 3, 'r...",0.681159,0.65942,0.676471,0.669118,0.661765,0.669591,0.008337,1
34,0.079128,0.004526,0.005939,0.000307,False,3,auto,50,"{'rf__bootstrap': False, 'rf__max_depth': 3, '...",0.673913,0.666667,0.676471,0.669118,0.661765,0.669591,0.005208,1
7,0.131894,0.002282,0.013727,0.003066,True,2,sqrt,100,"{'rf__bootstrap': True, 'rf__max_depth': 2, 'r...",0.666667,0.666667,0.691176,0.669118,0.654412,0.669591,0.011914,1
37,0.045514,0.004399,0.002986,5.7e-05,False,3,sqrt,20,"{'rf__bootstrap': False, 'rf__max_depth': 3, '...",0.65942,0.666667,0.683824,0.669118,0.669118,0.669591,0.007935,1


In [23]:
# View a list of the features and their importance scores
lis=list(zip(tweets_clean_df.nearest_city, grid.best_estimator_.named_steps["rf"].feature_importances_))
print(sorted(lis, key=lambda t: t[1], reverse=True)[:15])

[('Matthews', 0.09348910181315376), ('Portland', 0.06339685863130715), ('Millbrae', 0.06301260029178735), ('Polanco', 0.05916664490364212), ('San Francisco', 0.05874578898165345), ('El Segundo', 0.05383679200648061), ('Elizabeth', 0.051382027554076413), ('Millbrae', 0.04638714517678519), ('Millbrae', 0.0450880602024172), ('Bensenville', 0.040552821901613585), ('Palm Desert', 0.029719253796200506), ('New York City', 0.02969320982865009), ('Egypt Lake-Leto', 0.028951194840039777), ('Phra Pradaeng', 0.027979449573330317), ('North Charleston', 0.024511353891086527)]


In [None]:
#Save the classifier
#import pickle
#with open('rfc.pkl', 'wb') as f:
#    pickle.dump(rfc_grid.best_estimator_, f)

In [None]:
#Load the best classifier 
#rfc_model = pickle.load(open('model.pkl', 'rb'))
#result = rfc_model.score(X_test, y_test)
#print(result)

In [25]:
prob = grid.best_estimator_.predict_proba(X)
len(prob[:,2])

855

In [26]:
scores_df = pd.DataFrame(tweets_clean_df['nearest_city'])
scores_df['negative probability'] = list(prob[:, 0])
scores_df['neutral probability'] = list(prob[:, 1])
scores_df['positive probability'] = list(prob[:, 2])
scores_df

Unnamed: 0,nearest_city,negative probability,neutral probability,positive probability
21,New York City,0.339688,0.320611,0.339701
28,Chelsea,0.343276,0.323193,0.333531
29,El Segundo,0.345636,0.320172,0.334193
32,El Segundo,0.345636,0.320172,0.334193
34,Frisco,0.343276,0.323193,0.333531
42,Culver City,0.343276,0.323193,0.333531
62,Aliso Viejo,0.336390,0.336621,0.326989
69,Springfield Gardens,0.345206,0.323728,0.331066
74,Paradise,0.343276,0.323193,0.333531
108,San Francisco,0.331531,0.333842,0.334627


In [27]:
scores_df_sorted = scores_df.sort_values(by=['positive probability'], ascending=False)
scores_df_sorted

Unnamed: 0,nearest_city,negative probability,neutral probability,positive probability
5374,Santa Ana,0.326615,0.305790,0.367595
6237,Brushy Creek,0.326572,0.306518,0.366910
2896,Etobicoke,0.329417,0.309301,0.361282
14391,Flagami,0.329432,0.309718,0.360849
12342,Palm Springs,0.332791,0.313113,0.354097
2417,Palm Springs,0.332791,0.313113,0.354097
2416,Palm Springs,0.332791,0.313113,0.354097
3133,Palm Springs,0.332791,0.313113,0.354097
14511,Augusta,0.336289,0.316279,0.347432
8158,North Salt Lake,0.336518,0.316134,0.347348


In [28]:
scores_df_sorted.drop_duplicates()

Unnamed: 0,nearest_city,negative probability,neutral probability,positive probability
5374,Santa Ana,0.326615,0.305790,0.367595
6237,Brushy Creek,0.326572,0.306518,0.366910
2896,Etobicoke,0.329417,0.309301,0.361282
14391,Flagami,0.329432,0.309718,0.360849
12342,Palm Springs,0.332791,0.313113,0.354097
14511,Augusta,0.336289,0.316279,0.347432
8158,North Salt Lake,0.336518,0.316134,0.347348
10439,Tempe Junction,0.341543,0.312770,0.345687
10450,Warwick,0.339491,0.317050,0.343459
4901,Houston,0.336142,0.321801,0.342057


In [None]:
# save sorted list of cities by predicted positive sentiment
#scores_df_sorted.to_csv('scores.csv')