# MNIST

In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [2]:
df_train = pd.read_csv('../static/data/digits_train.csv')
df_train.head()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7,target
0,0.0,0.0,0.0,11.0,8.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,9.0,15.0,15.0,3.0,0.0,6
1,0.0,0.0,3.0,15.0,9.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,13.0,14.0,5.0,0.0,0.0,0
2,0.0,0.0,1.0,14.0,15.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,12.0,16.0,16.0,9.0,0.0,6
3,0.0,0.0,4.0,13.0,16.0,14.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,4.0,16.0,5.0,0.0,0.0,0.0,5
4,0.0,2.0,13.0,16.0,7.0,0.0,0.0,0.0,0.0,12.0,...,0.0,0.0,1.0,16.0,14.0,13.0,16.0,9.0,0.0,2


In [3]:
df_train.describe()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7,target
count,1347.0,1347.0,1347.0,1347.0,1347.0,1347.0,1347.0,1347.0,1347.0,1347.0,...,1347.0,1347.0,1347.0,1347.0,1347.0,1347.0,1347.0,1347.0,1347.0,1347.0
mean,0.0,0.317001,5.18931,11.870082,11.788419,5.715664,1.348181,0.1366,0.004454,1.933927,...,0.219005,0.000742,0.302153,5.541945,12.105419,11.788419,6.734967,2.025984,0.348181,4.444692
std,0.0,0.964311,4.754503,4.231442,4.292838,5.636599,3.297045,1.085794,0.086079,3.164288,...,1.045188,0.027247,1.006673,5.113887,4.368861,4.930023,5.895851,4.033671,1.807961,2.887519
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,10.0,9.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,11.0,10.0,0.0,0.0,0.0,2.0
50%,0.0,0.0,4.0,13.0,13.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,13.0,14.0,6.0,0.0,0.0,4.0
75%,0.0,0.0,9.0,15.0,15.0,11.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,9.5,16.0,16.0,12.0,2.0,0.0,7.0
max,0.0,8.0,16.0,16.0,16.0,16.0,16.0,15.0,2.0,15.0,...,13.0,1.0,9.0,16.0,16.0,16.0,16.0,16.0,16.0,9.0


In [4]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
y_train = df_train.target.values
X_train = scaler.fit_transform(df_train.drop('target', axis=1))
df_test = pd.read_csv('../static/data/digits_test.csv')
y_test = df_test.target.values
X_test = scaler.fit_transform(df_test.drop(columns=['target', 'index'], axis=1))
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1347, 64), (1347,), (450, 64), (450,))

### Logistic Regression

In [5]:
lr = LogisticRegression()
lr.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [6]:
params = {
    'C': [10, 12, 14, 16]
}

In [7]:
grid_cv = GridSearchCV(lr, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9681
최적 파라미터: {'C': 12}


In [8]:
best_lr = grid_cv.best_estimator_
pred = best_lr.predict(X_test)
accuracy_score(y_test, pred)

0.9666666666666667

In [9]:
joblib.dump(best_lr, '../static/model/digits_lr.pkl')

['../static/model/digits_lr.pkl']

### SVM

In [10]:
svc = SVC()
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [11]:
params = {
    'C': [2, 3, 4, 5]
}

In [12]:
grid_cv = GridSearchCV(svc, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9926
최적 파라미터: {'C': 4}


In [13]:
best_sv = grid_cv.best_estimator_
pred = best_sv.predict(X_test)
accuracy_score(y_test, pred)

0.9866666666666667

In [14]:
joblib.dump(best_sv, '../static/model/digits_sv.pkl')

['../static/model/digits_sv.pkl']

### Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [16]:
params = {
    'max_depth': [13, 14, 15, 16],
    'min_samples_split': [3, 4, 5, 6]
}

In [17]:
grid_cv = GridSearchCV(rf, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9762
최적 파라미터: {'max_depth': 15, 'min_samples_split': 3}


In [18]:
best_rf = grid_cv.best_estimator_
pred = best_rf.predict(X_test)
accuracy_score(y_test, pred)

0.9733333333333334

In [19]:
joblib.dump(best_rf, '../static/model/digits_rf.pkl')

['../static/model/digits_rf.pkl']

# 20newsgroups

In [17]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups

In [18]:
train_news = fetch_20newsgroups(subset='train', random_state=2021, remove=('headers', 'footers', 'quotes'))
df_train = pd.DataFrame(train_news.data)
df_train['target'] = train_news.target
df_train

Unnamed: 0,0,target
0,\nStop! Hold it! You have a few problems here....,19
1,"]Is it possible to do a ""wheelie"" on a motorcy...",8
2,\n\nBBS number\n510-226-2365,2
3,: [first post I've seen from the ol' Bug-Zoo (...,0
4,Archive-name: rec-autos/part5\n\n[this article...,7
...,...,...
11309,While I enjoy the trend towards the more class...,9
11310,\nyou can say that again.\nhow does $23 for a ...,7
11311,"If you can get it, you might want to try a Can...",13
11312,"\n\nWhy would you say ""especially Christianity...",15


In [19]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11314 entries, 0 to 11313
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       11314 non-null  object
 1   target  11314 non-null  int32 
dtypes: int32(1), object(1)
memory usage: 132.7+ KB


In [20]:
df_train[df_train[0] == ''].count()

0         218
target    218
dtype: int64

In [21]:
df_train = df_train.drop(df_train[df_train[0] == ''].index)
df_train

Unnamed: 0,0,target
0,\nStop! Hold it! You have a few problems here....,19
1,"]Is it possible to do a ""wheelie"" on a motorcy...",8
2,\n\nBBS number\n510-226-2365,2
3,: [first post I've seen from the ol' Bug-Zoo (...,0
4,Archive-name: rec-autos/part5\n\n[this article...,7
...,...,...
11309,While I enjoy the trend towards the more class...,9
11310,\nyou can say that again.\nhow does $23 for a ...,7
11311,"If you can get it, you might want to try a Can...",13
11312,"\n\nWhy would you say ""especially Christianity...",15


In [22]:
df_train.values.shape

(11096, 2)

In [23]:
test_news = fetch_20newsgroups(subset='test', random_state=2021, remove=('headers', 'footers', 'quotes'))
df_test = pd.DataFrame(test_news.data)
df_test['target'] = test_news.target
df_test

Unnamed: 0,0,target
0,Need Diet for Diverticular Disease\nand ideas ...,13
1,There are chips which perform the voice compre...,11
2,"Total Baseball, which also tries to evaluate a...",9
3,If anyone would like to get rid of their SegaC...,6
4,"\nWhat is ""aluminium siding""? I keep seeing r...",19
...,...,...
7527,"\n\nI know you work at sun, but that's really ...",3
7528,"\nIMO, the influence of Stalin, or for that ma...",0
7529,My 486DX2-50 has 8MB of 70ns RAM and a Trident...,3
7530,"[argument over ""reasonable"" players and umpire...",9


In [24]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7532 entries, 0 to 7531
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       7532 non-null   object
 1   target  7532 non-null   int32 
dtypes: int32(1), object(1)
memory usage: 88.4+ KB


In [25]:
df_test[df_test[0] == ''].count()

0         162
target    162
dtype: int64

In [26]:
df_test = df_test.drop(df_test[df_test[0] == ''].index)
df_test

Unnamed: 0,0,target
0,Need Diet for Diverticular Disease\nand ideas ...,13
1,There are chips which perform the voice compre...,11
2,"Total Baseball, which also tries to evaluate a...",9
3,If anyone would like to get rid of their SegaC...,6
4,"\nWhat is ""aluminium siding""? I keep seeing r...",19
...,...,...
7527,"\n\nI know you work at sun, but that's really ...",3
7528,"\nIMO, the influence of Stalin, or for that ma...",0
7529,My 486DX2-50 has 8MB of 70ns RAM and a Trident...,3
7530,"[argument over ""reasonable"" players and umpire...",9


In [27]:
df_test.values.shape

(7370, 2)

In [30]:
df_test.to_csv('../static/data/news_test.csv')

In [28]:
X_train = df_train[0].values.reshape(1, -1)[0].tolist()
X_test = df_test[0].values.reshape(1, -1)[0].tolist()
y_train = df_train['target'].values
y_test = df_test['target'].values
len(X_train), y_train.shape

(11096, (11096,))

In [29]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
cv.fit(X_train)
X_train_count = cv.transform(X_train)
X_test_count = cv.transform(X_test)
X_train_count.shape

(11096, 101631)

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer()
tv.fit(X_train)
X_train_tfidf = tv.transform(X_train)
X_test_tfidf = tv.transform(X_test)
X_train_tfidf.shape

(11096, 101631)

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
pipeline_tvlr = Pipeline([
    ('tv', TfidfVectorizer(stop_words='english')),
    ('lr', LogisticRegression())
])

In [34]:
params ={
    'tv__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tv__max_df': [500, 700],
    'lr__C': [10, 20]
}

In [18]:
from sklearn.model_selection import GridSearchCV
grid_pipe = GridSearchCV(pipeline_tvlr, param_grid=params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_pipe.fit(X_train, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

# IMDB 영화평 감성분석

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('../static/data/IMDB/labeledTrainData.tsv', header=0, sep='\t', quoting=3)
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [3]:
import re
df['review'] = df.review.str.replace('<br />', ' ')
df['review'] = df.review.apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))

In [4]:
from sklearn.model_selection import train_test_split

feature_df = df.drop(['id', 'sentiment'], axis=1, inplace=False)
X_train, X_test, y_train, y_test = train_test_split(feature_df, df.sentiment, test_size=0.25, random_state=2021)
X_train.shape, X_test.shape

((18750, 1), (6250, 1))

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

pipeline_cl = Pipeline([
    ('cv', CountVectorizer(stop_words='english')),
    ('lr', LogisticRegression())
])

In [8]:
params = {
    'cv__ngram_range': [(1, 2), (1, 3)],
    'cv__max_df': [900, 1000],
    'lr__C': [10, 20]
}

In [9]:
from sklearn.model_selection import GridSearchCV
grid_pipe = GridSearchCV(pipeline_cl, param_grid=params, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_pipe.fit(X_train.review, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed: 10.4min finished
{'cv__max_df': 1000, 'cv__ngram_range': (1, 2), 'lr__C': 10} 0.8768533333333334


In [10]:
pred = grid_pipe.predict(X_test.review)
accuracy_score(y_test, pred)

0.88064

In [11]:
import joblib
best_cvlr = grid_pipe.best_estimator_
joblib.dump(best_cvlr, '../static/model/imdb_cvlr.pkl')

['../static/model/imdb_cvlr.pkl']

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
pipeline_tl = Pipeline([
    ('tv', TfidfVectorizer(stop_words='english')),
    ('lr', LogisticRegression())
])

In [8]:
params = {
    'tv__ngram_range': [(1, 1), (1, 2)],
    'tv__max_df': [700, 900],
    'lr__C': [10, 20]
}

In [9]:
from sklearn.model_selection import GridSearchCV
grid_pipe = GridSearchCV(pipeline_tl, param_grid=params, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_pipe.fit(X_train.review, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  3.7min finished
{'lr__C': 20, 'tv__max_df': 900, 'tv__ngram_range': (1, 2)} 0.8869333333333334


In [10]:
best_tvlr = grid_pipe.best_estimator_
pred = best_tvlr.predict(X_test.review)
accuracy_score(y_test, pred)

0.888

In [12]:
import joblib
joblib.dump(best_tvlr, '../static/model/imdb_tvlr.pkl')

['../static/model/imdb_tvlr.pkl']

In [13]:
type(X_test)

pandas.core.frame.DataFrame

In [14]:
X_test.to_csv('../static/data/imdb_test.csv')

In [15]:
pred

array([1, 0, 1, ..., 1, 1, 0], dtype=int64)

In [16]:
df = pd.read_csv('../static/data/imdb_test.csv')
index = 10
X = df.loc[index, 'review']
X

' I know that some films  I mean  European films   that are very bad films  are being regarded as great cinema by certain   critics    only because they re non American  I saw the     IMDB score for this film and noticed the fact that this was being selected for certain big festivals  Don t let this fool you  Unless you re one of those people that likes mind numbing films like this  and call it great art afterwards  skip it  The film contains one hilarious scene after another  a similar  Italian  film popped into my mind  the terrible PREFERISCO IL RUMORE DEL MARE  I prefer the sound of the sea    The problem with these films is that they re not only boring  like some other strangely praised films  but that they almost play like camp  I mean  let s face it  the acting is horrible  I mean  soap opera level   the story has not one surprise  this has been done endless times before  connecting several storylines  SHORT CUTS  MAGNOLIA  PLAYING BY HEART  only much better   not one realistic 