In [1]:
import numpy as np
import pandas as pd


In [2]:
df_train = pd.read_csv("training_twitter_x_y_train.csv")
df_test = pd.read_csv("test_twitter_x_test.csv")

In [3]:
df_train.head(3)

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)


In [4]:
df_test.head(3)

Unnamed: 0,tweet_id,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,569682010270101504,American,,zsalim03,,0,@AmericanAir In car gng to DFW. Pulled over 1h...,,2015-02-22 18:15:50 -0800,Texas,Central Time (US & Canada)
1,569608307184242688,American,,sa_craig,,0,"@AmericanAir after all, the plane didn’t land ...",,2015-02-22 13:22:57 -0800,"College Station, TX",Central Time (US & Canada)
2,567879304593408001,Southwest,,DanaChristos,,1,@SouthwestAir can't believe how many paying cu...,,2015-02-17 18:52:31 -0800,CT,Eastern Time (US & Canada)


In [24]:
# Creating Raw datasets
X_train = df_train['text']
y_train = df_train['airline_sentiment']

X_test = df_test['text']

In [25]:
len(X_train), len(X_test)

(10980, 3660)

In [26]:
# Creating stopwords 
from nltk.corpus import stopwords
import string
stops = (stopwords.words('english'))
stops += (list(string.punctuation))


# Stemming

In [27]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
def stemming(tweet):
    words = tweet.split()
    output = []
    for word in words:
        stemmed_word = ps.stem(word)
        output.append(stemmed_word)
    final_output = " ".join(output)
    return final_output

X_train = [stemming(tweet) for tweet in X_train]
X_test = [stemming(tweet) for tweet in X_test]

In [28]:
X_train[0], X_test[0]

('@southwestair I am schedul for the morning, 2 day after the fact, yes..not sure whi my even flight wa the onli one cancel flightl',
 "@americanair In car gng to dfw. pull over 1hr ago - veri ici roads. on-hold with AA sinc 1hr. can't reach arpt for aa2450. wat 2 do?")

# Lemmatizing

In [29]:
# Creating Lemmatizer object
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Creating pos tag
from nltk import pos_tag

In [30]:
def get_pos(tag):
    if tag.startswith('J'):
        return 'a'
    elif tag.startswith('V'):
        return 'v'
    elif tag.startswith('N'):
        return 'n'
    elif tag.startswith('R'):
        return 'r'
    else:
        return 'n'

In [31]:
# twwet to words to clean tweet
import re
def tweet_to_clean_tweet(tweet):
    output_words = []
    letters_only = re.sub("[^a-zA-Z]", " ",tweet)
    words = letters_only.lower().split()
    
    for word in words:
        if word not in stops:
            pos = pos_tag([word])
            tag = pos[0][1]
            clean_word = lemmatizer.lemmatize(word, get_pos(tag))
            output_words.append(clean_word)
    final_tweet = " ".join(output_words)
    return final_tweet

In [33]:
X_train = [tweet_to_clean_tweet(tweet) for tweet in X_train]
X_test = [tweet_to_clean_tweet(tweet) for tweet in X_test]

In [34]:
def remove_airline_name(tweet):
    words = tweet.split()
    words.pop(0)
    tweet = " ".join(words)
    return tweet

X_train = [remove_airline_name(tweet) for tweet in X_train]
X_test = [remove_airline_name(tweet) for tweet in X_test]

In [35]:
X_train[0], X_test[0]

('schedul morning day fact yes sure whi even flight wa onli one cancel flightl',
 'car gng dfw pull hr ago veri ici road hold aa sinc hr reach arpt aa wat')

In [36]:
print(len(X_train))
print(len(X_test))
print(len(y_train))

10980
3660
10980


# Count Vectorize

In [77]:
from sklearn.feature_extraction.text import CountVectorizer
vectorize = CountVectorizer(ngram_range=(1, 2))
X_train_features = vectorize.fit_transform(X_train)
X_test_features = vectorize.transform(X_test)
X_train_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [78]:
vectorize.get_feature_names()

['aa',
 'aa abl',
 'aa account',
 'aa advantag',
 'aa agent',
 'aa air',
 'aa amp',
 'aa anoth',
 'aa around',
 'aa avoid',
 'aa award',
 'aa awesom',
 'aa ba',
 'aa bc',
 'aa behalf',
 'aa believe',
 'aa botch',
 'aa cancel',
 'aa chang',
 'aa com',
 'aa consid',
 'aa contact',
 'aa cool',
 'aa cstmr',
 'aa current',
 'aa custom',
 'aa cx',
 'aa delay',
 'aa depart',
 'aa dfw',
 'aa direct',
 'aa divert',
 'aa dividendmil',
 'aa doe',
 'aa doesnt',
 'aa due',
 'aa dulles',
 'aa elit',
 'aa email',
 'aa employe',
 'aa ever',
 'aa famili',
 'aa family',
 'aa fan',
 'aa far',
 'aa fault',
 'aa feel',
 'aa ff',
 'aa firstclass',
 'aa flew',
 'aa flight',
 'aa frequent',
 'aa friend',
 'aa gate',
 'aa get',
 'aa give',
 'aa go',
 'aa help',
 'aa helpaa',
 'aa hold',
 'aa http',
 'aa ignor',
 'aa impress',
 'aa incompet',
 'aa jfk',
 'aa kc',
 'aa kill',
 'aa knew',
 'aa know',
 'aa kp',
 'aa la',
 'aa land',
 'aa lax',
 'aa left',
 'aa let',
 'aa lga',
 'aa lhr',
 'aa load',
 'aa lose',
 '

# SVM

In [83]:
# Building an SVM Model on this:
from sklearn.svm import SVC

svc = SVC(C = 0.5)
svc.fit(X_train_features, y_train)
svc.score(X_train_features,y_train)

0.820856102003643

In [84]:
y_pred = svc.predict(X_test_features)
y_pred

array(['negative', 'negative', 'negative', ..., 'neutral', 'positive',
       'negative'], dtype=object)

In [85]:
ans = np.savetxt("ans.csv",y_pred, fmt = '%s')

# Grid search on SVM

In [58]:
# trying grid search cv
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

In [86]:
svc = SVC()
grid = {"C":[0.25, 0.5, 1, 3, 5],
       "gamma": [1e-3, 5e-4, 1e-4]}

grid_search = GridSearchCV( svc, grid, cv = KFold(n_splits=10,shuffle=True))
grid_search.fit(X_train_features, y_train)
print(grid_search.best_estimator_)
print(grid_search.best_score_)

SVC(C=5, gamma=0.001)
0.6908014571948998


In [None]:
ans = np.savetxt("ans1.csv",y_pred, fmt = '%s')

# Random Forest

In [72]:
#Trying Random forest
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=500,criterion='gini',max_depth=50,min_samples_split=2,min_samples_leaf=1,
                             min_weight_fraction_leaf=0.0,max_features='auto',max_leaf_nodes=None,min_impurity_decrease=0.0,
                             min_impurity_split=None,bootstrap=True,oob_score=False,n_jobs=None,random_state=None,
                             verbose=0,warm_start=False,class_weight=None,ccp_alpha=0.0,max_samples=None,)
rfc.fit(X_train_features, y_train)
y_pred_rfc = rfc.predict(X_test_features)
print(rfc.score(X_train_features, y_train))
ans = np.savetxt("ans_rf.csv",y_pred_rfc, fmt = '%s')

0.8403460837887068


# GridSearchCV for Random Forest

In [71]:
# We got score of 0.75
# Using grid Search over this
clf_RF = RandomForestClassifier()
grid_RF ={
    'n_estimators':[5,10,20,50,100,500],
    'max_depth':[10,20,30,35,50]
}
grid_search_RF = GridSearchCV(clf_RF, grid_RF)
grid_search_RF.fit(X_train_features, y_train)
print(grid_search_RF.best_estimator_)
print(grid_search_RF.best_score_)

RandomForestClassifier(max_depth=50, n_estimators=500)
0.7297814207650273


In [None]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(max_depth=50, n_estimators=50)
rfc.fit(X_train_features, y_train)
y_pred_rfc = rfc.predict(X_test_features)
ans = np.savetxt("ans_rf.csv",y_pred_rfc, fmt = '%s')

# Naive Bayes

In [87]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train_features, y_train)
nb.score(X_train_features, y_train)

0.9283242258652095

In [88]:
y_pred_nb = nb.predict(X_test_features)
ans = np.savetxt("ans_nb.csv",y_pred_rfc, fmt = '%s')

# Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train_features, y_train)