In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack

In [2]:
random_state = 2018
path = "~/.kaggle/competitions/sentiment-analysis-on-movie-reviews/"

In [3]:
train_data = pd.read_csv(path + "train.tsv", sep="\t", low_memory=False)
test_data = pd.read_csv(path + "test.tsv", sep="\t", low_memory=False)
submit = pd.read_csv(path + "sampleSubmission.csv", sep=',', low_memory=False)

In [4]:
train_data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [5]:
test_data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [6]:
train_data['Sentiment'].value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

## [LR with words and char n-grams](https://www.kaggle.com/tunguz/lr-with-words-and-char-n-grams)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [8]:
# 提取文本信息
train_text = train_data['Phrase']
test_text = test_data['Phrase']

In [9]:
# 先将训练集和测试集的文本拼接起来。
all_text = pd.concat([train_text, test_text])

In [10]:
# 字级别的属性
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 3),
    max_features=18000)
word_vectorizer.fit(all_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=18000, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='unicode', sublinear_tf=True,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=True,
        vocabulary=None)

In [11]:
# 特征。
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

In [12]:
# 字符级别的属性
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(1, 7),
    max_features=70000)
char_vectorizer.fit(all_text)

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=60000, min_df=1,
        ngram_range=(1, 7), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='unicode', sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [13]:
# 特征。
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

In [14]:
# 特征合并。
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])

In [15]:
train_features

<156060x78000 sparse matrix of type '<class 'numpy.float64'>'
	with 27859784 stored elements in COOrdinate format>

In [16]:
test_features

<66292x78000 sparse matrix of type '<class 'numpy.float64'>'
	with 10965724 stored elements in COOrdinate format>

## 数据划分

In [17]:
train_target = train_data['Sentiment']

In [18]:
train_target.values

array([1, 2, 2, ..., 3, 2, 2])

## LR

In [19]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(penalty='l1')

In [20]:
clf.fit(train_features, train_target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [21]:
test_data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [32]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

In [42]:
param_grid = {
    'C':[0.01, 0.1, 0.2, 0.5, 1, 2],
}

In [43]:
rskf = RepeatedStratifiedKFold(n_repeats=10,n_splits=10, random_state=random_state)

In [45]:
gscv = GridSearchCV(estimator=clf, cv=rskf, n_jobs=-1, iid=True, scoring='f1', param_grid = param_grid)

In [24]:
pred = clf.predict(test_features)

In [37]:
clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
submit['Sentiment'] = pred

In [29]:
submit.head()

Unnamed: 0,PhraseId,Sentiment
0,156061,3
1,156062,3
2,156063,2
3,156064,3
4,156065,3


In [30]:
submit['Sentiment'].value_counts()

2    44511
3    11244
1     8040
4     1582
0      915
Name: Sentiment, dtype: int64