In [0]:
!pip install -q keras

In [3]:
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

In [4]:
!pip install xgboost==0.7.post3
!pip install textblob



In [6]:
import pandas as pd
import numpy as np
import io
from textblob import TextBlob


from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import xgboost,textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

Using TensorFlow backend.


In [0]:
from google.colab import files

In [126]:
uploaded = files.upload()

Saving train.csv to train (2).csv


In [128]:
uploaded = files.upload()

Saving test.csv to test (2).csv


In [64]:
uploaded = files.upload()

Saving sample_submission_LnhVWA4.csv to sample_submission_LnhVWA4.csv


In [0]:
train = pd.read_csv(io.StringIO(uploaded["train.csv"].decode('utf-8')))

In [0]:
test = pd.read_csv(io.StringIO(uploaded["test.csv"].decode("utf-8")))

In [0]:
#data preprocessing

!pip install HTMLParser
from html.parser import HTMLParser
html_parser= HTMLParser()

#Converts all named and numeric character references to the corresponding unicode characters
train["tweet"]= html_parser.unescape(train["tweet"])
test["tweet"]=html_parser.unescape(test["tweet"])

#remove http links
train['tweet'] = train['tweet'].str.replace('http\S+|www.\S+', '', case=False)
test['tweet'] = test['tweet'].str.replace('http\S+|www.\S+', '', case=False)


train['tweet']=train['tweet'].astype(str).str.lower()
test["tweet"]=test["tweet"].astype(str).str.lower()

#removing tagged words
def remove_word(tweet):
  return " ".join(filter(lambda x: x[0]!='@',tweet.split()))

train["tweet"]=train['tweet'].apply(remove_word)
test["tweet"]=test["tweet"].apply(remove_word)

#expanding apostrophes
apostrophes={"i'm":"i am","i'd":"i would","they'd":"they would","he'd":"he would",
             "she'd":"she would","it'd":"it would","'ve":" have","it's":"it is","don't":"do not",
             "didn't":"did not","aren't":"are not","isn't":"is not","wasn't":"was not","haven't":"have not",
             "hadn't":"had not","hasn't":"has not","won't":"will not","can't":"cannot","don't":'do not',
             "wouldn't":"would not","shouldn't":"should not","couldn't":"cound not","i'll":" i will",
             "they'll":"they will","it'll":"it will","he'll":"he will","she'll":"she will","y'll":"you all","'re":"are","doesn't":"does not"}

def apost_word(tweet):
  return " ".join([apostrophes.get(w,w) for w in tweet.split()])

train["tweet"]=train["tweet"].apply(apost_word)
test["tweet"]=test["tweet"].apply(apost_word)

#removing stop words
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
stop =stopwords.words('english')


train["tweet"]=train["tweet"].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
test["tweet"]=test["tweet"].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

#removing punctuations
train['tweet']=train["tweet"].str.replace('.','')
train["tweet"]=train["tweet"].str.replace(',','')
train["tweet"]=train["tweet"].str.replace('!','')

test['tweet']=test["tweet"].str.replace('.','')
test["tweet"]=test["tweet"].str.replace(',','')
test["tweet"]=test["tweet"].str.replace('!','')

#removing the top 10 most common words
commonwords=pd.Series(' '.join(train['tweet']).split()).value_counts()[:7]
freq = list(commonwords.index)
train["tweet"] =train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
test["tweet"] =test['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
commonwords2=pd.Series(' '.join(train['tweet']).split()).value_counts()[1:3]
freq2 = list(commonwords2.index)
train["tweet"] =train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq2))
test["tweet"] =test['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq2))

#removing the last 10 less common words
rare = pd.Series(' '.join(train['tweet']).split()).value_counts()[-10:]
rare = list(rare.index)

train["tweet"]=train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in rare))
test["tweet"]=test['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in rare))

#standardizing  words
import itertools
def standardization(tweet):
  return ''.join(''.join(s)[:1] for _, s in itertools.groupby(tweet))
train["tweet"]=train["tweet"].apply(standardization)
test["tweet"]=test["tweet"].apply(standardization)

#removing stemming words through lemmatization
nltk.download('wordnet')
from textblob import Word
train["tweet"]=train['tweet'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
test["tweet"]=test['tweet'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))



In [0]:
label= train["label"]
tweet=train["tweet"]

In [0]:
#train test split
train_X,val_X,train_y,val_y=model_selection.train_test_split(train["tweet"],train["label"],test_size=0.2,stratify=train["label"],random_state=0)

In [0]:
#count vectorizer
count_vect = CountVectorizer(analyzer='word',token_pattern='\w{1,}')
count_vect.fit(train["tweet"])
train_X_count=count_vect.transform(train_X)
val_X_count=count_vect.transform(val_X)

In [0]:
#word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word',token_pattern='\w{1,}', max_features=2000)
tfidf_vect.fit(train["tweet"])
train_X_tfidf=tfidf_vect.transform(train_X)
val_X_tfidf=tfidf_vect.transform(val_X)

In [0]:
#N-gram level tf-idf
tfidf_vect_ngram=TfidfVectorizer(analyzer='word',token_pattern='\w{1,}',ngram_range=(1,2),max_features=2000)
tfidf_vect_ngram.fit(train["tweet"])
train_X_tfidf_ngram=tfidf_vect_ngram.transform(train_X)
val_X_tfidf_ngram=tfidf_vect_ngram.transform(val_X)


In [0]:
#character level tf-idf 
tfidf_vect_ngram_chars=TfidfVectorizer(analyzer='char',token_pattern='\w{1,}',ngram_range=(2,3),max_features=2000)
tfidf_vect_ngram_chars.fit(train["tweet"])
train_X_tfidf_ngram_chars=tfidf_vect_ngram_chars.transform(train_X)
val_X_tfidf_ngram_chars=tfidf_vect_ngram_chars.transform(val_X)

In [0]:
def train_model(classifier,feature_vector_train,label,feature_vector_valid,is_neural_net=False):
  classifier.fit(feature_vector_train,label)
  predictions=classifier.predict(feature_vector_valid)
  
  if is_neural_net:
    predictions=predictions.argmax(axis=-1)
  
  
  return metrics.f1_score(predictions,val_y)

In [200]:
#naives bayes on count vectors

f1_score = train_model(naive_bayes.MultinomialNB(), train_X_count, train_y,val_X_count)
print("NB, Count Vectors",f1_score)

#naive bayes on word level tf-idf vectors
f1_score=train_model(naive_bayes.MultinomialNB(),train_X_tfidf,train_y,val_X_tfidf)
print("NB, Word Level TF-IDF Vectors", f1_score)

#naive bayes on n-gram level TF-IDF vecotrs
f1_score = train_model(naive_bayes.MultinomialNB(), train_X_tfidf_ngram, train_y,val_X_tfidf_ngram)
print("NB, n-gram Vectors",f1_score)

#naive bayes on character level tf-idf vectors
f1_score=train_model(naive_bayes.MultinomialNB(),train_X_tfidf_ngram_chars,train_y,val_X_tfidf_ngram_chars)
print("NB, character Level TF-IDF Vectors", f1_score)



NB, Count Vectors 0.7682403433476396
NB, Word Level TF-IDF Vectors 0.773006134969325
NB, n-gram Vectors 0.7734939759036146
NB, character Level TF-IDF Vectors 0.7817371937639198


In [201]:
#Linear Classifier

#count vectors
f1_score = train_model(linear_model.LogisticRegression(), train_X_count, train_y,val_X_count)
print("LR, Count Vectors",f1_score)

#word level tf-idf vectors
f1_score = train_model(linear_model.LogisticRegression(), train_X_tfidf, train_y,val_X_tfidf)
print("LR, word level vectors",f1_score)

#ngram level tf-idf vectors
f1_score = train_model(linear_model.LogisticRegression(), train_X_tfidf_ngram, train_y,val_X_tfidf_ngram)
print("LR, ngram level Vectors",f1_score)

#character level tf-idf vectors
f1_score = train_model(linear_model.LogisticRegression(), train_X_tfidf_ngram_chars, train_y,val_X_tfidf_ngram_chars)
print("LR, charlevel Vectors",f1_score)

LR, Count Vectors 0.722860791826309
LR, word level vectors 0.7209944751381215
LR, ngram level Vectors 0.7227586206896552
LR, charlevel Vectors 0.7661691542288556


In [202]:
#SVM

#count vectors
f1_score = train_model(svm.SVC(), train_X_count, train_y,val_X_count)
print("svm, Count Vectors",f1_score)

#word level tf-idf vectors
f1_score = train_model(linear_model.LogisticRegression(), train_X_tfidf, train_y,val_X_tfidf)
print("svm, word level vectors",f1_score)

#ngram level tf-idf vectors
f1_score = train_model(linear_model.LogisticRegression(), train_X_tfidf_ngram, train_y,val_X_tfidf_ngram)
print("svm, ngram level Vectors",f1_score)

#character level tf-idf vectors
f1_score = train_model(linear_model.LogisticRegression(), train_X_tfidf_ngram_chars, train_y,val_X_tfidf_ngram_chars)
print("svm, charlevel Vectors",f1_score)


svm, Count Vectors 0.0
svm, word level vectors 0.7209944751381215
svm, ngram level Vectors 0.7227586206896552
svm, charlevel Vectors 0.7661691542288556


  'recall', 'true', average, warn_for)


In [203]:
#RF Model

#count vectors
f1_score = train_model(ensemble.RandomForestClassifier(), train_X_count, train_y,val_X_count)
print("RF, Count Vectors",f1_score)

#word level tf-idf vectors
f1_score = train_model(ensemble.RandomForestClassifier(), train_X_tfidf, train_y,val_X_tfidf)
print("RF, word level vectors",f1_score)

#ngram level tf-idf vectors
f1_score = train_model(ensemble.RandomForestClassifier(), train_X_tfidf_ngram, train_y,val_X_tfidf_ngram)
print("RF, ngram level Vectors",f1_score)

#character level tf-idf vectors
f1_score = train_model(ensemble.RandomForestClassifier(), train_X_tfidf_ngram_chars, train_y,val_X_tfidf_ngram_chars)
print("RF, charlevel Vectors",f1_score)


RF, Count Vectors 0.6685082872928175
RF, word level vectors 0.6949806949806949
RF, ngram level Vectors 0.6675427069645203
RF, charlevel Vectors 0.6694444444444444


In [204]:
#XGBoost Model

#count vectors
f1_score = train_model(xgboost.XGBClassifier(), train_X_count.tocsc(), train_y,val_X_count.tocsc())
print("XGB, Count Vectors",f1_score)

#word level tf-idf vectors
f1_score = train_model(xgboost.XGBClassifier(), train_X_tfidf.tocsc(), train_y,val_X_tfidf.tocsc())
print("XGB, word level vectors",f1_score)

#ngram level tf-idf vectors
f1_score = train_model(xgboost.XGBClassifier(), train_X_tfidf_ngram.tocsc(), train_y,val_X_tfidf_ngram.tocsc())
print("XGB, ngram level Vectors",f1_score)

#character level tf-idf vectors
f1_score = train_model(xgboost.XGBClassifier(), train_X_tfidf_ngram_chars.tocsc(), train_y,val_X_tfidf_ngram_chars.tocsc())
print("XGB, charlevel Vectors",f1_score)


  if diff:


XGB, Count Vectors 0.47957371225577256


  if diff:


XGB, word level vectors 0.4814159292035398


  if diff:


XGB, ngram level Vectors 0.4929577464788732
XGB, charlevel Vectors 0.7910798122065728


  if diff:


In [25]:
#shallow NN

def create_model_architecture(input_size):
  input_layer = layers.Input((input_size,),sparse=True)
  hidden_layer= layers.Dense(100,activation='relu')(input_layer)
  output_layer = layers.Dense(1,activation='sigmoid')(hidden_layer)
 
  classifier=models.Model(inputs=input_layer,outputs=output_layer)
  classifier.compile(optimizer=optimizers.Adam(),loss='binary_crossentropy')
  return classifier

classifier=create_model_architecture(train_X_count.shape[1])
f1_score= train_model(classifier,train_X_count,train_y,val_X_count,is_neural_net=True)
print("NN, Count Vectorizer",f1_score)

classifier2=create_model_architecture(train_X_tfidf.shape[1])
f1_score= train_model(classifier2,train_X_tfidf,train_y,val_X_tfidf,is_neural_net=True)
print("NN, word tf-idf",f1_score)

classifier3=create_model_architecture(train_X_tfidf_ngram.shape[1])
f1_score= train_model(classifier3,train_X_tfidf_ngram,train_y,val_X_tfidf_ngram,is_neural_net=True)
print("NN, ngram tf-idf Vectorizer",f1_score)

classifier4=create_model_architecture(train_X_tfidf_ngram_chars.shape[1])
f1_score= train_model(classifier4,train_X_tfidf_ngram_chars,train_y,val_X_tfidf_ngram_chars,is_neural_net=True)
print("NN, charlevel tf-idf Vectorizer",f1_score)


Epoch 1/1
NN, Count Vectorizer 0.0


  'recall', 'true', average, warn_for)


Epoch 1/1
NN, word tf-idf 0.0
Epoch 1/1
NN, ngram tf-idf Vectorizer 0.0
Epoch 1/1
NN, charlevel tf-idf Vectorizer 0.0


In [0]:
#found XGBoost classifier for char level ngram as the best model
test_tfidf_ngram_chars=tfidf_vect_ngram_chars.transform(test["tweet"])

classifier=xgboost.XGBClassifier()
classifier.fit(train_X_tfidf_ngram_chars,train_y)
predictions=classifier.predict(test_tfidf_ngram_chars)

df=pd.DataFrame({"id":test.id,"label":predictions})

In [208]:
from IPython.display import HTML
import base64

def create_download_link( df, title = "Download CSV file", filename = "data_tfidf_ngram_chars_XGB.csv"):  
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)

create_download_link(df)