In [35]:
import pandas as pd
import numpy as np
import re

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer  

import gensim.downloader as api
from gensim.test.utils import common_texts
from gensim.sklearn_api import W2VTransformer

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import precision_score,recall_score, confusion_matrix, classification_report, accuracy_score, f1_score

In [36]:
def load_data(filename):
    dataset= pd.read_csv(filename)
    return dataset

dataset= load_data('./train.csv')
dataset

Unnamed: 0,text,labels
0,@realDonaldTrump This is one of the worst time...,0
1,How about the crowd in Oval in today's #AUSvIN...,1
2,@skroskz @shossy2 @JoeBiden Biden &amp; his so...,0
3,#etsy shop: Benedict Donald so called presiden...,1
4,@realDonaldTrump Good build a wall around Arka...,0
...,...,...
5261,@ICC should allow ms dhoni to keep glove. It i...,1
5262,Trump on avoiding movie pirating: 'of course y...,1
5263,I noticed recently Jamie Oliver's restaurants ...,1
5264,#TeamIndia geared up is okay. What's on the GL...,0


In [37]:
def separate_datalabels(dataset):
    return dataset['text'], dataset['labels']

dataset, labels= separate_datalabels(dataset)

In [38]:
def remove_emoji(document):
    emoji_pattern = re.compile("["
                                "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                "\U0001F300-\U0001F5FF"  # symbols & pictographs
                                "\U0001F600-\U0001F64F"  # emoticons
                                "\U0001F680-\U0001F6FF"  # transport & map symbols
                                "\U0001F700-\U0001F77F"  # alchemical symbols
                                "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                                "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                                "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                                "\U0001FA00-\U0001FA6F"  # Chess Symbols
                                "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                                "\U00002702-\U000027B0"  # Dingbats
                                "\U000024C2-\U0001F251" 
                               "]+", flags=re.UNICODE)
    document= emoji_pattern.sub(r'', document) # no emoji
    return document

def remove_urls(document):
    document = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', document, flags=re.MULTILINE)
    return document

def remove_punctuations(document):
    punctuations = '''0123456789!()-[]{};:'"\,<>./?@#$%^&*_~'''
    
    for p in document: 
        if p in punctuations: 
            document = document.replace(p, "") 
 
    return document   

def remove_stopwords(document):
    stop_words = set(stopwords.words('english'))
    tokenized_words= word_tokenize(document)
    filtered_sentence = [w for w in tokenized_words if not w in stop_words] 
    document= " ".join(filtered_sentence)

    return document


def stemming(document):
    ps = PorterStemmer() 
    words = word_tokenize(document) 
    stemmed_sentence = [ps.stem(w) for w in words] 
    document= " ".join(stemmed_sentence)

    return document



def clean_corpus(dataset):
    cleaned_dataset= list()
    
    for document in dataset:
        document= document.lower()                      # convert to lower case (shrink the #letters)
        
        document= remove_emoji(document)                # remove emojis- not required for sentiment analysis
        
        document= remove_urls(document)                 # remove urls- not required for sentiment analysis
        
        document= remove_punctuations(document)         # remove punctuations- not required for sentiment analysis
        
        document= remove_stopwords(document)            # tokenize and remove stop words
        
        document= stemming(document)                    # perform document stemming
        
        cleaned_dataset.append(document)
    
    return cleaned_dataset
    
    
    
dataset= clean_corpus(dataset)

In [39]:
# glove_model = api.load('glove-wiki-gigaword-300')

In [40]:
# vectorizer = CountVectorizer(stop_words= 'english')
# dataset = vectorizer.fit_transform(dataset)
# print(dataset)

# tfidf_transformer = TfidfTransformer()
# dataset = tfidf_transformer.fit_transform(dataset)
# print (dataset.shape)

# dataset= normalize(dataset)
# print (dataset)

In [42]:
# Create a model to represent each word by a 10 dimensional vector.
model = W2VTransformer(size=10, min_count=1, seed=1)

# What is the vector representation of the word 'graph'?
wordvecs = model.fit(common_texts).transform(['graph', 'system'])
assert wordvecs.shape == (2, 10)

In [45]:
model.fit_transform(dataset, labels)

KeyError: "word 'realdonaldtrump one worst time american caus seriou damag countri sure dont wish happi father day everyon know your terribl father derangeddonald trumpisatraitor trumpisacrimin' not in vocabulary"

In [21]:
X_train, X_test, y_train, y_test = train_test_split(dataset, labels, test_size=0.2, random_state=0)

In [31]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(kernel= 'rbf', C= 2,degree= 6)      #experiment with different 'C' values
SVM.fit(X_train,y_train)

SVC(C=2, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=6, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [32]:
predictions_SVM = SVM.predict(X_test)

In [33]:
print("Accuracy Score: ",accuracy_score(predictions_SVM, y_test)*100)

Accuracy Score:  63.662239089184055
