# Sentiment Analysis Project 
## - Sentiment Analysis on Movie Reviews

-------------------------------------

##  [ 목차 ]
   ### 1. 분석의 개요 및 목적
   ### 2. 데이터 set 분석
   ### 3. 텍스트 전처리
   ### 4. 모델링
   ### 5. Cross-validation
   ### 6. Submit

-------------------------------------

### 1. 분석의 개요 및 목적

* 로튼 토마토의 영화감상평 텍스트를 분석하여 해당 텍스트가 아래 5가지 감성 중 어느 카테고리에 해당하는지 분류

### 2. 데이터 set 분석

1. PhraseId : 순차적으로 생성되는 고유 아이디
2. SentenceId : 구문(Phrase)이 속한 원래 문장의 아이디. SentenceId가 같은 구문들은 하나의 문장에서 분리(split)된 것
3. Phrase : 영화 감상평
4. Sentiment : 영화평에 대한 감성 카테고리
    * 0 - negative
    * 1 - somewhat negative
    * 2 - neutral
    * 3 - somewhat positive
    * 4 - positive

In [14]:
import pandas as pd
import numpy  as np

### 3. 텍스트 전처리

In [4]:
# Read both tab-separated splits with identical options, indexing rows by
# their PhraseId column.
train, test = (
    pd.read_csv(tsv_path, sep="\t", index_col="PhraseId")
    for tsv_path in ("data/train.tsv", "data/test.tsv")
)

# Preview the first rows of the training split.
train.head()

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2
3,1,A series,2
4,1,A,2
5,1,series,2


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer capped at 1000 vocabulary features; the bare name on
# the last line makes the notebook echo its configuration.
vectorizer = CountVectorizer(max_features=1000)
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [6]:
# Learn the vocabulary from the training phrases (the test set is not used here).
vectorizer.fit(train["Phrase"])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [7]:
# Encode the training phrases as a term-count matrix using the fitted vocabulary.
X_train = vectorizer.transform(train["Phrase"])
X_train

<156060x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 653246 stored elements in Compressed Sparse Row format>

In [8]:
# Column labels for the count matrix: one vocabulary term per feature column.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` when the installed version provides it
# and fall back to the legacy method on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    # Convert to a plain list to match the type the legacy method returned.
    vocabulary = list(vectorizer.get_feature_names_out())
else:
    vocabulary = vectorizer.get_feature_names()

# Densify only a slice of the sparse matrix so the preview stays small.
pd.DataFrame(X_train[0:1000].toarray(), columns=vocabulary).head()

Unnamed: 0,10,20,2002,90,ability,able,about,above,across,act,...,written,wrong,ya,year,years,yet,york,you,young,your
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Encode the test phrases with the same fitted vectorizer, so the feature
# columns line up with X_train.
X_test = vectorizer.transform(test["Phrase"])
X_test

<66292x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 254492 stored elements in Compressed Sparse Row format>

In [10]:
# Target labels: the Sentiment column of the training frame.
y_train = train["Sentiment"]

# Sanity-check the number of labels, then preview the first few.
print(y_train.shape)
y_train.head()

(156060,)


PhraseId
1    1
2    2
3    2
4    2
5    2
Name: Sentiment, dtype: int64

### 4. 모델링

In [11]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with stochastic gradient descent; all settings are
# library defaults except the fixed random seed.
model = SGDClassifier(random_state=37)
model

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=37, shuffle=True,
       tol=None, verbose=0, warm_start=False)

### 5. Cross-validation

In [12]:
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions for every training row using 5-fold cross-validation.
# NOTE(review): phrases that share a SentenceId may end up in different folds,
# which could make this estimate optimistic -- consider passing `groups`.
y_predict = cross_val_predict(estimator=model, X=X_train, y=y_train, cv=5)
y_predict



array([1, 3, 2, ..., 2, 2, 2], dtype=int64)

In [16]:
# Attach the cross-validated predictions to a copy of the training frame
# (`assign` returns a new DataFrame, so `train` itself is left untouched).
result = train.assign(**{"Sentiment(Predict)": y_predict})

# Absolute gap between the true and the predicted class (0 means exact match).
result["Distance"] = (result["Sentiment"] - result["Sentiment(Predict)"]).abs()

# Show the largest misclassification gaps first.
result = result.sort_values("Distance", ascending=False)
result.head()

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Sentiment(Predict),Distance
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
101171,5317,With Zoe Clarke-Williams 's lackluster thrille...,0,4,4
132656,7153,"Miyazaki 's nonstop images are so stunning , a...",4,0,4
30341,1409,"the 2002 film does n't really believe in it , ...",0,4,4
85477,4421,K-19 will not go down in the annals of cinema ...,0,4,4
48083,2345,may be galled that you 've wasted nearly two h...,0,4,4


In [17]:
# Fit the model on the full training set.
model.fit(X_train, y_train)



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=37, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [19]:
from sklearn.metrics import accuracy_score

# Fraction of training phrases whose cross-validated prediction equals the label.
score = accuracy_score(y_true=y_train, y_pred=y_predict)

print("Score = {0:.6f}".format(score))

Score = 0.557061


In [18]:
# Predict a sentiment class for every test phrase with the fitted model.
predictions = model.predict(X_test)
predictions

array([2, 2, 2, ..., 2, 2, 2], dtype=int64)

### 6. Submit

In [20]:
# Load the sample submission file as a template, indexed by PhraseId.
submit = pd.read_csv("data/sampleSubmission.csv", index_col="PhraseId")
submit.head()

Unnamed: 0_level_0,Sentiment
PhraseId,Unnamed: 1_level_1
156061,3
156062,3
156063,2
156064,3
156065,3


In [21]:
# Replace the template's Sentiment column with the model's predictions.
# NOTE(review): this assigns by position, so it assumes the template rows are
# in the same order as the test set -- confirm.
submit["Sentiment"] = predictions
submit.head()

Unnamed: 0_level_0,Sentiment
PhraseId,Unnamed: 1_level_1
156061,2
156062,2
156063,2
156064,2
156065,2


In [22]:
# Write the filled-in submission to disk.
# NOTE(review): this overwrites the same file that was read as the template;
# consider writing to a separate output path to keep the original sample.
submit.to_csv("data/sampleSubmission.csv")