In [129]:
import csv
import pandas as pd
from pandas import DataFrame
import numpy as np

# read data

In [133]:
# Let wide frames show up to 50 columns before pandas truncates them.
pd.set_option('display.max_columns', 50)

# The training file is tab-separated; load it and show its header plus the first rows.
train_raw = pd.read_csv('train.tsv', sep='\t', encoding='utf-8')
print(train_raw.columns.tolist())  # file header
print(train_raw.head())

['PhraseId', 'SentenceId', 'Phrase', 'Sentiment']
   PhraseId  SentenceId  \
0         1           1   
1         2           1   
2         3           1   
3         4           1   
4         5           1   

                                                                                                                                                                                         Phrase  \
0  A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .   
1                                                                                                                 A series of escapades demonstrating the adage that what is good for the goose   
2                                                                                                                                                                                      A series   
3      

In [20]:
# Show the full, untruncated text of the first training phrase (row label 0).
train_raw['Phrase'][0]

'A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .'

In [21]:
# Number of training rows per Sentiment label, most frequent first.
train_raw.Sentiment.value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [140]:
# Load the tab-separated test file and preview its first rows.
test_raw = pd.read_csv('test.tsv', sep='\t', encoding='utf-8')
print(test_raw.head())

   PhraseId  SentenceId  \
0    156061        8545   
1    156062        8545   
2    156063        8545   
3    156064        8545   
4    156065        8545   

                                                   Phrase  
0  An intermittently pleasing but mostly routine effort .  
1    An intermittently pleasing but mostly routine effort  
2                                                      An  
3       intermittently pleasing but mostly routine effort  
4              intermittently pleasing but mostly routine  


## clean data

In [150]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import *
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

In [58]:
# Normalise every training phrase into a space-joined string of stemmed words:
# tokenise -> lower-case -> keep alphabetic tokens only -> Porter-stem.
# The stemmer does not depend on the row, so it is built once rather than
# once per phrase as before.
porter = PorterStemmer()
phrase_clean = []
# Iterate over the column values directly instead of looking each row up by
# label (`Phrase[i]`): same order, no per-row index lookup, and no reliance on
# the frame having a default 0..n-1 index.
for text in train_raw.Phrase:
    # tokenise and convert to lower case
    tokens = [w.lower() for w in word_tokenize(text)]
    # filter out punctuation (and any other token that is not purely alphabetic)
    words = [word for word in tokens if word.isalpha()]
    # stem the words (Porter stemming, not lemmatisation)
    stemmed = [porter.stem(word) for word in words]
    phrase_clean.append(' '.join(stemmed))


In [142]:
# clean test data
# Normalise every test phrase into a space-joined string of stemmed words:
# tokenise -> lower-case -> keep alphabetic tokens only -> Porter-stem.
# The stemmer does not depend on the row, so it is built once rather than
# once per phrase as before.
porter = PorterStemmer()
test_phrase_clean = []
# Iterate over the column values directly instead of looking each row up by
# label (`Phrase[i]`): same order, no per-row index lookup, and no reliance on
# the frame having a default 0..n-1 index.
for text in test_raw.Phrase:
    # tokenise and convert to lower case
    tokens = [w.lower() for w in word_tokenize(text)]
    # filter out punctuation (and any other token that is not purely alphabetic)
    words = [word for word in tokens if word.isalpha()]
    # stem the words (Porter stemming, not lemmatisation)
    stemmed = [porter.stem(word) for word in words]
    test_phrase_clean.append(' '.join(stemmed))

In [143]:
# test data
# Put the cleaned phrases in a one-column frame named 'clean_review' and
# attach it, side by side, to the raw test frame.
test_phrase_tidy = DataFrame({'clean_review': test_phrase_clean})
test_clean = pd.concat([test_raw, test_phrase_tidy], axis=1, sort=False)
test_clean.head(1)

Unnamed: 0,PhraseId,SentenceId,Phrase,clean_review
0,156061,8545,An intermittently pleasing but mostly routine effort .,an intermitt pleas but mostli routin effort


In [134]:
# train data
# Put the cleaned phrases in a one-column frame named 'clean_review' and
# attach it, side by side, to the raw training frame.
phrase_tidy = DataFrame({'clean_review': phrase_clean})
train_clean = pd.concat([train_raw, phrase_tidy], axis=1, sort=False)
train_clean.head(1)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,clean_review
0,1,1,"A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .",1,a seri of escapad demonstr the adag that what is good for the goos is also good for the gander some of which occasion amus but none of which amount to much of a stori


## TF-IDF

In [164]:
# Fit one TF-IDF vectorizer on the cleaned train and test phrases together,
# then turn each set into a matrix with that shared vocabulary.
# NOTE(review): because the vectorizer is fit on train + test text, the
# features (and any cross-validation run on them) already reflect test-set
# term statistics -- presumably deliberate; confirm this is intended.
test1 = test_clean['clean_review']
full_text = train_clean['clean_review'].tolist() + test1.tolist()
vectorizer = TfidfVectorizer().fit(full_text)
df_upsampled_vectorized = vectorizer.transform(train_clean['clean_review'])
test_vectorized = vectorizer.transform(test1)


In [165]:
# One-vs-rest logistic regression on the TF-IDF features, evaluated with
# 3-fold cross-validated accuracy.
logreg = LogisticRegression()
ovr = OneVsRestClassifier(logreg)
# No explicit ovr.fit(...) before scoring: cross_val_score fits its own clone
# of the estimator on every fold, so a prior fit on the full training set was
# a wasted training pass that did not influence the scores.
scores = cross_val_score(
    ovr,
    df_upsampled_vectorized,
    train_clean['Sentiment'],
    scoring='accuracy',
    n_jobs=-1,
    cv=3,
)
# Both figures are scaled by 100; the std is now formatted with two decimals
# like the mean (the previous `{1:2f}` spec set a field width, not a precision).
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores)*100, np.std(scores)*100))

Cross-validation mean accuracy 58.49%, std 0.068792.


In [175]:
# Train the one-vs-rest model on the full training matrix, then predict a
# Sentiment label for every test phrase.
ovr.fit(df_upsampled_vectorized, train_clean['Sentiment'])
pred = ovr.predict(test_vectorized)
# Wrap the predictions in a single-column frame named 'Sentiment'.
pred_df = DataFrame({'Sentiment': pred})

## submission

In [182]:
# Pair each test PhraseId with its predicted Sentiment (aligned on the row
# index), write the result to CSV without the index column, and preview it.
sub_df = pd.DataFrame({
    'PhraseId': test_raw['PhraseId'],
    'Sentiment': pred_df['Sentiment'],
})
sub_df.to_csv('sentiment_prediction.csv', index=False)
sub_df.head()

Unnamed: 0,PhraseId,Sentiment
0,156061,3
1,156062,3
2,156063,2
3,156064,3
4,156065,3
