In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.utils import resample
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

In [2]:
from google.colab import files
uploaded = files.upload()

Saving train.csv to train.csv


In [3]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


In [4]:
pos_sentiment = train[train['sentiment']==1]
neg_sentiment = train[train['sentiment']==-1]
neutral = train[train['sentiment']==0]
news = train[train['sentiment']==2]

# Upsample minority
neg_sentiment_upsampled = resample(neg_sentiment, replace=True, n_samples=len(pos_sentiment), random_state=27)
neutral_upsampled = resample(neutral, replace=True, n_samples=len(pos_sentiment), random_state=27)
news_upsampled = resample(news, replace=True, n_samples=len(pos_sentiment), random_state=27)

In [5]:
# Combine Majority and Upsampled Minority Datasets
upsampled = pd.concat([pos_sentiment, neg_sentiment_upsampled, neutral_upsampled, news_upsampled])

In [6]:
y = upsampled['sentiment']
X = upsampled['message']

In [7]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=2, stop_words='english')
X_vectorized = vectorizer.fit_transform(X)

In [8]:
X_train,X_val,y_train,y_val = train_test_split(X_vectorized,y,test_size=0.3,shuffle=True, random_state=27)

In [9]:
gs_params = {'kernel':('linear', 'rbf'),
             'C':(0.25,0.5,1.0),
             'gamma':(0.5,1,2)}

In [10]:
sv = SVC()
clf = GridSearchCV(sv,gs_params)
sv.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [11]:
sv_pred = sv.predict(X_val)

In [17]:
print('Classification Report')
print(classification_report(y_val, sv_pred))

Classification Report
              precision    recall  f1-score   support

          -1       0.99      0.99      0.99      2565
           0       0.96      0.97      0.97      2468
           1       0.92      0.92      0.92      2586
           2       0.95      0.94      0.95      2617

    accuracy                           0.96     10236
   macro avg       0.96      0.96      0.96     10236
weighted avg       0.96      0.96      0.96     10236



In [24]:
f1_score(y_val, sv_pred, average='macro')

0.9564412099644224

In [25]:
from google.colab import files
uploaded = files.upload()

Saving test.csv to test.csv


In [26]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928


In [27]:
testx = test['message']
test_vect = vectorizer.transform(testx)

In [28]:
y_pred = sv.predict(test_vect)

In [29]:
test['sentiment'] = y_pred

In [30]:
test.head()

Unnamed: 0,message,tweetid,sentiment
0,Europe will now be looking to China to make su...,169760,1
1,Combine this with the polling of staffers re c...,35326,1
2,"The scary, unimpeachable evidence that climate...",224985,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,1


In [31]:
testsub = pd.DataFrame(y_pred, columns=['sentiment'])
testsub.head()

Unnamed: 0,sentiment
0,1
1,1
2,1
3,1
4,1


In [32]:
from google.colab import files
uploaded = files.upload()

Saving tweetid.csv to tweetid.csv


In [33]:
tweet = pd.read_csv('tweetid.csv')
tweet.head()

Unnamed: 0,tweetid
0,169760
1,35326
2,224985
3,476263
4,872928


In [34]:
output = pd.DataFrame({"tweetid":tweet['tweetid']})
testgs = output.join(testsub)
testgs.to_csv("testgs.csv", index=False)

In [35]:
testgs.head()

Unnamed: 0,tweetid,sentiment
0,169760,1
1,35326,1
2,224985,1
3,476263,1
4,872928,1
