In [None]:
#This is another method which is based on the frequency method but it is different to the count vectorization in the sense that
#it takes into account not just the occurrence of a word in a single document but in the entire corpus. 
#TF-IDF score represents the relative importance of a term in the document and the entire corpus.
#In this classifier word level tfidf scores are considered.

In [1]:
# Import Libraries
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
%matplotlib inline
import matplotlib.pyplot as plt
import pandas, numpy as np, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

Using TensorFlow backend.


In [2]:
# load the dataset
train = pandas.read_csv('/Users/PrasadiA/Desktop/train.csv', encoding = "unicode_escape")
test = pandas.read_csv('/Users/PrasadiA/Desktop/tests.csv', encoding = "unicode_escape")

In [3]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train['Tweet text'], train['Label'])

In [4]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(train['Tweet text'])
tfidf_vect.fit(test['Text'])

xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)
xtest_tfidf =  tfidf_vect.transform(test['Text'])

In [5]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, feature_vector_test, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    #predict the labels on tests data set
    predictions_test = classifier.predict(feature_vector_test)
    np.savetxt("prediction.csv", predictions_test, delimiter=",")
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)

In [6]:
# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf, xtest_tfidf)
print ("NB, WordLevel TF-IDF: ", accuracy)

NB, WordLevel TF-IDF:  0.6798748696558915


In [7]:
# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf, xtest_tfidf)
print ("LR, WordLevel TF-IDF: ", accuracy)

LR, WordLevel TF-IDF:  0.8404588112617309




In [8]:
# SVM on Word Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf, train_y, xvalid_tfidf, xtest_tfidf)
print ("SVM, WordLevel TF-IDF: ", accuracy)



SVM, WordLevel TF-IDF:  0.4754953076120959


In [9]:
# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf, xtest_tfidf)
print ("RF, WordLevel TF-IDF: ", accuracy)



RF, WordLevel TF-IDF:  0.8164754953076121
