## Import the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

from sklearn.metrics import f1_score

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Load in your data from Kaggle
By working in a Kaggle kernel, you can access the competition data directly and make your submission without downloading your output file.

In [2]:
# Read the labelled training tweets and the unlabelled test tweets from the
# CSV files in the working directory (train first, then test).
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Count how many training rows carry each sentiment label (shows the class imbalance).
train.sentiment.value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

In [4]:
# Preview the first rows of the raw test frame.
test.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928


In [5]:
# Preview the first rows of the raw training frame.
train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


## Replace URLs with a placeholder token

In [6]:
# Regex for http/https links: the scheme followed by one or more URL
# characters or %XX escapes.
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
# Placeholder token written in place of every matched link.
subs_url = r'url-web'

# Apply the same substitution to the message column of both frames.
for tweets_df in (train, test):
    tweets_df['message'] = tweets_df['message'].replace(
        to_replace=pattern_url,
        value=subs_url,
        regex=True,
    )

## Make lower case

In [7]:
# Lower-case every message in both the training and the test frame.
for tweets_df in (train, test):
    tweets_df['message'] = tweets_df['message'].str.lower()

## Strip out punctuation marks and numerals

In [8]:
import string
# Deletion table built once: every ASCII punctuation mark and digit maps to None.
_PUNCT_DIGIT_TABLE = str.maketrans('', '', string.punctuation + '0123456789')


def remove_punctuation_numbers(post: str) -> str:
    """Return `post` with ASCII punctuation marks and digits removed.

    Args:
        post: The text to clean.

    Returns:
        A copy of `post` in which every character from `string.punctuation`
        and every digit 0-9 has been dropped. All other characters
        (whitespace, letters, non-ASCII symbols) are kept in their
        original order.
    """
    # str.translate filters the whole string in one pass instead of doing a
    # Python-level membership scan for every character.
    return post.translate(_PUNCT_DIGIT_TABLE)

# Strip punctuation and digits from every message in both frames.
for tweets_df in (train, test):
    tweets_df['message'] = tweets_df['message'].apply(remove_punctuation_numbers)

## Resampling

In [9]:
from sklearn.utils import resample
# Partition the training frame by sentiment label:
# 1 -> believe, -1 -> no_belief, 0 -> neutral, 2 -> news.
believe, no_belief, neutral, news = (
    train[train['sentiment'] == label] for label in (1, -1, 0, 2)
)


def _bootstrap(frame, size):
    """Draw `size` rows from `frame` with replacement, seeded with 27."""
    return resample(frame, replace=True, n_samples=size, random_state=27)


# Scale samples.
# The largest class is drawn with replacement at its own size, so it is a
# bootstrap sample of the original rows rather than the rows themselves.
believe_sampled = _bootstrap(believe, len(believe))
# The three smaller classes are each drawn to 7000 rows.
no_belief_sampled = _bootstrap(no_belief, 7000)
neutral_sampled = _bootstrap(neutral, 7000)
news_sampled = _bootstrap(news, 7000)

# NOTE(review): rows are duplicated here before the train/validation split
# further down, so copies of the same tweet can land on both sides of that
# split — consider splitting first and resampling only the training part.
sampled = pd.concat([believe_sampled, no_belief_sampled, neutral_sampled, news_sampled])


In [10]:
# Preview the training frame after URL replacement, lower-casing and
# punctuation/digit stripping (this shows `train`, not the resampled frame).
train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,polyscimajor epa chief doesnt think carbon dio...,625221
1,1,its not like we lack evidence of anthropogenic...,126103
2,2,rt rawstory researchers say we have three year...,698562
3,1,todayinmaker wired was a pivotal year in the...,573736
4,1,rt soynoviodetodas its and a racist sexist cl...,466954


In [11]:
# Class counts after resampling.
# NOTE(review): the recorded output below lists 7250 rows for classes -1, 2
# and 0, while the resampling cell requests n_samples = 7000 — the output
# appears to come from an earlier run; restart and re-run to refresh it.
sampled.sentiment.value_counts()

 1    8530
-1    7250
 2    7250
 0    7250
Name: sentiment, dtype: int64

## Splitting out the X variable from the target

In [12]:
# Features are the cleaned tweet text; the target is the sentiment label.
X, y = sampled['message'], sampled['sentiment']

## Turning text into something your model can read

In [13]:
# TF-IDF features over unigrams and bigrams; min_df=2 drops terms that occur
# in fewer than two documents.
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=2)
# NOTE(review): the vectoriser is fitted on all of X before the
# train/validation split, so validation text contributes to the vocabulary
# and IDF weights.
X_vectorized = vectorizer.fit_transform(X)

## Splitting the training data into a training and validation set

In [14]:
# Hold out 17% of the vectorised rows for validation, stratified on the label
# and seeded so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_vectorized,
    y,
    test_size=.17,
    shuffle=True,
    stratify=y,
    random_state=11,
)

## Training the model and evaluating using the validation set 

In [15]:
# parameters = {'kernel':('linear', 'rbf'), 
#               'C':(0.25,1.0),
#               'gamma': (1,2)}

In [16]:
from sklearn.svm import SVC

In [17]:
# Linear-kernel support vector classifier with C=1.
# NOTE(review): per the scikit-learn docs, gamma is the kernel coefficient for
# 'rbf', 'poly' and 'sigmoid' only, so gamma=0.05 has no effect with
# kernel='linear'.
svm = SVC(kernel='linear',C=1,gamma=0.05)
# clf = GridSearchCV(svm, parameters)
# Fit on the training split; the fitted estimator's repr is the cell output.
svm.fit(X_train,y_train)

SVC(C=1, gamma=0.05, kernel='linear')

In [18]:
# Predict labels for the validation split and report plain accuracy.
# NOTE(review): the validation rows come from the bootstrapped frame, so they
# may duplicate training rows; treat this score as optimistic.
y_opt = svm.predict(X_val)
val_accuracy = accuracy_score(y_val, y_opt)
print("The accuracy for our tuned model is: ", val_accuracy)

The accuracy for our tuned model is:  0.9408850726552179


In [None]:
# rfc = RandomForestClassifier()
# rfc.fit(X_train, y_train)
# rfc_pred = rfc.predict(X_val)

## Checking the performance of our model on the validation set

In [19]:
# Macro-averaged F1 on the validation predictions (unweighted mean of the
# per-class F1 scores).
f1_score(y_val, y_opt, average="macro")

0.942324009777612

## Getting our test set ready 

In [None]:
# Cleaned test messages (same preprocessing cells as the training text).
testx = test['message']
# transform (not fit_transform): reuse the vocabulary and IDF weights learned
# from the training text.
test_vect = vectorizer.transform(testx)

## Making predictions on the test set and adding a sentiment column to our original test df

In [None]:
# Predict a sentiment label for every vectorised test message.
y_pred = svm.predict(test_vect)

In [None]:
# Store the predictions as a new `sentiment` column on the test frame.
test['sentiment'] = y_pred

In [None]:
# Preview the test frame with the predicted sentiment column attached.
test.head()

## Creating an output csv for submission

In [None]:
# Write only the tweet id and predicted sentiment to the submission CSV,
# without the DataFrame index.
test[['tweetid','sentiment']].to_csv('testsubmission8.csv', index=False)