In [2]:
# Import Libraries
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
%matplotlib inline
import matplotlib.pyplot as plt
import pandas, numpy as np, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

Using TensorFlow backend.


In [3]:
# load the dataset
train = pandas.read_csv('/Users/PrasadiA/Desktop/train.csv', encoding = "unicode_escape")
test = pandas.read_csv('/Users/PrasadiA/Desktop/tests.csv', encoding = "unicode_escape")

In [4]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train['Tweet text'], train['Label'])

In [5]:
# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(train['Tweet text'])
tfidf_vect_ngram.fit(test['Text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)
xtest_tfidf_ngram =  tfidf_vect_ngram.transform(test['Text'])


In [6]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, feature_vector_test, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    #predict the labels on tests data set
    predictions_test = classifier.predict(feature_vector_test)
    np.savetxt("prediction.csv", predictions_test, delimiter=",")
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)

In [6]:
# Naive Bayes on ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, xtest_tfidf_ngram)
print ("NB, Ngram TF-IDF: ", accuracy)

NB, Ngram TF-IDF:  0.6444212721584984


In [7]:
# Linear Classifier on ngram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, xtest_tfidf_ngram)
print ("LR, Ngram TF-IDF: ", accuracy)

LR, Ngram TF-IDF:  0.6110531803962461




In [8]:
# SVM on ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, xtest_tfidf_ngram)
print ("SVM, Ngram TF-IDF: ", accuracy)



SVM, Ngram TF-IDF:  0.4921793534932221


In [9]:
# RF on ngram Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, xtest_tfidf_ngram)
print ("RF, Ngram TF-IDF: ", accuracy)



RF, Ngram TF-IDF:  0.6141814389989573
