In [1]:
import pandas as pd
import numpy as np
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.data import load
from nltk.corpus import wordnet


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('tagsets')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Data Import 

In [4]:
rand_state = 48

In [5]:
df = pd.read_csv("train1.csv",index_col = 0)
df.head()

Unnamed: 0_level_0,TITLE,TOPIC
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,RITE AID CORP <RAD> SETS DIVIDEND,0
1,DEL E. WEBB INVESTMENT <DWPA> 4TH QTR NET,0
2,GENERAL HOST CORP <GH> SETS QUARTERLY,0
3,PROFESSOR LIFTS BANC TEXAS <BTX> PREFERRED STAKE,1
4,WINCHELL'S DONUT <WDH> SETS INITIAL QUARTERLY,0


In [6]:
#check if it's unbalance
df.groupby("TOPIC").count()

Unnamed: 0_level_0,TITLE
TOPIC,Unnamed: 1_level_1
0,3107
1,2406
2,2404


In [7]:
#check missing value
df[df["TITLE"].isna()]

Unnamed: 0_level_0,TITLE,TOPIC
ID,Unnamed: 1_level_1,Unnamed: 2_level_1


# Feature Engineering

In [8]:
stop = set(stopwords.words("english"))
punc = set(string.punctuation)
punc.update(["--","jan","january","feb","february","mar","march","apr","april","may","jun","june","jul","july",
             "aug","august","sept","september","oct","october","nov","november","dec","december"])
lemma = WordNetLemmatizer()
treetags = load('help/tagsets/upenn_tagset.pickle').keys()

posdic = dict(map(lambda x: (x, wordnet.ADJ) if x.startswith('J')
 else((x, wordnet.ADV) if x.startswith('R')
    else((x, wordnet.VERB) if x.startswith('V')
         else(x, wordnet.NOUN))), treetags))


In [9]:
def clean(sent):
    sent = sent.lower().replace("<"," ").replace(">", " ").split()
    sent = [lemma.lemmatize(word, posdic[pos]) for (word,pos) in pos_tag(sent) 
            if (word.isalpha()) & (word not in stop) & (word not in punc) ]  
    return " ".join(word for word in sent)

In [10]:
df["FEATURE"] = [clean(row) for row in df["TITLE"]]

### TFIDF & Topic Modeling

In [11]:
import time
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation as LDA


In [11]:
vectorizer  = CountVectorizer(min_df=2,ngram_range=(1,3))
n_gram = vectorizer.fit_transform(df["FEATURE"])
feature_names = vectorizer.get_feature_names()

In [12]:
n_gram

<7917x10091 sparse matrix of type '<class 'numpy.int64'>'
	with 63471 stored elements in Compressed Sparse Row format>

In [13]:
tfidf_gram = TfidfTransformer(norm = "l1").fit_transform(n_gram)

In [14]:
tfidf_gram

<7917x10091 sparse matrix of type '<class 'numpy.float64'>'
	with 63471 stored elements in Compressed Sparse Row format>

for i in range(2,7):
    lda = LDA(n_components=i,learning_method='batch', max_iter=100,random_state=rand_state).fit(n_gram)
    perplexity = lda.perplexity(n_gram)
    score = lda.score(n_gram)
    print('\nPerplexity of %d topics: ' %i, perplexity)
    print('score: ', score)

In [15]:
lda = LDA(n_components=3,learning_method='batch',random_state=rand_state).fit(n_gram)
lda_data = lda.transform(n_gram)

In [16]:
topics = lda.components_
topics

array([[2.1784386 , 0.33701549, 2.32817181, ..., 0.38196381, 0.41092557,
        2.31784797],
       [0.33486781, 2.32906653, 0.33368499, ..., 3.28391195, 2.15963721,
        0.3484502 ],
       [0.48669359, 0.33391798, 0.33814319, ..., 0.33412424, 0.42943723,
        0.33370182]])

In [17]:
num_words  = 10
for _, topic in enumerate(topics):
    print (",".join([feature_names[i] for i in topic.argsort()[:-num_words - 1:-1]]))

qtr,net,inc,qtr net,corp,loss,year,co,qtr loss,dividend
say,oil,bank,raise,price,unit,rate,pct,buy,set
pct,market,money,mln,stock,rise,see,japan,set,buy


# Training

In [74]:
param_grid = {
        'C': [0.1,1,10,50],
        'kernel': ['linear', 'rbf'],
        'gamma': [0.01, 0.1, 1]     
    }

In [75]:
grid = GridSearchCV(estimator = svm.SVC(), cv=5, n_jobs=-1, param_grid=param_grid, scoring = "accuracy",verbose = 20)

In [None]:
grid.fit(lda_data, y_train)

In [None]:
print_results(clf)

### Pipeline

In [None]:
def print_results(grid):
    print("best score: ", grid.best_score_)
    print("best parameters: ", grid.best_params_)

### Random Forest

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost

In [14]:
X_train = df["FEATURE"]
y_train = df["TOPIC"]

In [15]:
estimators = [('vectorizer', CountVectorizer()), 
              ('tfidf',TfidfTransformer()),
#                ('lda', LDA(learning_method='batch',max_iter=10,random_state=rand_state)), 
               ('rf', RandomForestClassifier())]

In [16]:
pipe = Pipeline(memory=None,steps = estimators)

In [17]:
param_grid = [
    {
        'vectorizer__ngram_range': [(1,4),(1,3)],
        'vectorizer__min_df': [3, 4],
#         'lda__n_components': [5,7,9],
        'tfidf__norm': ['l1','l2'],
        'tfidf__use_idf': ['True','False'],
        'rf__n_estimators': [1000,2000,2500],
        'rf__max_features': ["sqrt","log2"],
        'rf__max_depth':  [50,70],
        'rf__min_samples_split': [20,30],
        'rf__min_samples_leaf': [2, 3],
        'rf__bootstrap': [True]
        
    }
]

In [18]:
grid_rf = GridSearchCV(pipe, cv=5, n_jobs=-1, param_grid=param_grid, scoring = "accuracy",verbose=10)

In [None]:
grid_rf.fit(X_train, y_train)

In [103]:
print_results(grid_rf)

best score:  0.8505747126436781
best parameters:  {'rf__n_estimators': 2000, 'tfidf__norm': 'l2', 'rf__bootstrap': True, 'vectorizer__min_df': 3, 'vectorizer__ngram_range': (1, 3), 'rf__min_samples_leaf': 2, 'rf__max_features': 'log2', 'rf__max_depth': 50, 'rf__min_samples_split': 20}


best score:  0.8505747126436781
best parameters:  {'rf__n_estimators': 2000, 'tfidf__norm': 'l2', 'rf__bootstrap': True, 'vectorizer__min_df': 3, 
        'vectorizer__ngram_range': (1, 3), 'rf__min_samples_leaf': 2, 'rf__max_features': 'log2', 
                   'rf__max_depth': 50, 'rf__min_samples_split': 20}

### Logistic Regression

In [83]:
estimators = [('vectorizer', CountVectorizer()), 
              ('tfidf',TfidfTransformer()),
#                ('lda', LDA(learning_method='batch',random_state=rand_state)), 
               ('logit', LogisticRegression())]

In [84]:
pipe = Pipeline(memory=None,steps = estimators)

In [85]:
param_grid = [
    {
        'vectorizer__ngram_range': [(1,5),(1,4),(1,3)],
        'vectorizer__min_df': [3,4],
        'tfidf__norm': ['l1','l2'],
        'tfidf__use_idf': ['True','False'],
#         'lda__n_components': [8,10,12],
        'logit__penalty': ['l1', 'l2'], 
        'logit__C': [0.1,1,10]      
    }
]

In [86]:
grid_logit = GridSearchCV(pipe, cv=5, n_jobs=-1, param_grid=param_grid, scoring = "accuracy", verbose=10)

In [None]:
grid_logit.fit(X_train, y_train)

In [88]:
print_results(grid_logit)

best score:  0.8880889225716811
best parameters:  {'vectorizer__ngram_range': (1, 3), 'vectorizer__min_df': 3, 'tfidf__norm': 'l2', 'logit__penalty': 'l2', 'logit__C': 10}


### SVM-SVC

In [20]:
estimators = [('vectorizer', CountVectorizer()), 
              ('tfidf',TfidfTransformer()),
#                ('lda', LDA(learning_method='batch',random_state=rand_state)), 
               ('svc', svm.SVC())]

pipe = Pipeline(memory=None,steps = estimators)

In [24]:
param_grid = [
    {
        'vectorizer__ngram_range': [(1,2),(1,3),(1,4)],
        'vectorizer__min_df': [2,3],
        'tfidf__norm': ['l1','l2'],
        'tfidf__use_idf': ['True','False'],
#         'lda__n_components': [4,7,10],
        'svc__C': [0.1,1,10],
        'svc__kernel': ['rbf'],
        'svc__degree': [2,4,6],
        'svc__gamma': [0.5, 1]
    
    }
]

In [25]:
grid_svc = GridSearchCV(pipe, cv=5, n_jobs=-1, param_grid=param_grid, scoring = "accuracy",verbose=10)

In [None]:
grid_svc.fit(X_train, y_train)

In [27]:
print_results(grid_svc)

best score:  0.8947833775419982
best parameters:  {'svc__C': 1, 'svc__gamma': 1, 'svc__kernel': 'rbf', 'tfidf__norm': 'l2', 'vectorizer__ngram_range': (1, 2), 'vectorizer__min_df': 3, 'svc__degree': 2, 'tfidf__use_idf': 'True'}


### Stochastic Gradient Descent

In [138]:
estimators = [('vectorizer', CountVectorizer()), 
              ('tfidf',TfidfTransformer()),
#                ('lda', LDA(learning_method='batch',random_state=rand_state)), 
               ('sgd', SGDClassifier())]

pipe = Pipeline(memory=None,steps = estimators)

In [148]:
param_grid = [
    {
        'vectorizer__ngram_range': [(1,1),(1,2),(1,3)],
        'vectorizer__min_df': [3,4],
        'tfidf__norm': ['l1','l2'],
        'sgd__loss': ['hinge', 'log'],
        'sgd__penalty': ['l1','l2','elasticnet'],
        'sgd__alpha': [0.0001, 0.001, 0.01, 0.1]
    
    }
]

In [150]:
grid_sgd = GridSearchCV(pipe, cv=5, n_jobs=-1, param_grid=param_grid, scoring = "accuracy",verbose=10)

In [None]:
grid_sgd.fit(X_train, y_train)

In [152]:
print_results(grid_sgd)

best score:  0.8890994063407857
best parameters:  {'tfidf__norm': 'l2', 'sgd__alpha': 0.0001, 'vectorizer__min_df': 3, 'vectorizer__ngram_range': (1, 3), 'sgd__loss': 'hinge', 'sgd__penalty': 'l2'}


### KNN

In [223]:
estimators = [('vectorizer', CountVectorizer()), 
              ('tfidf',TfidfTransformer()),
#                ('lda', LDA(learning_method='batch',random_state=rand_state)), 
               ('knn', KNeighborsClassifier())]

pipe = Pipeline(memory=None,steps = estimators)

In [224]:
param_grid = [
    {
        'vectorizer__ngram_range': [(1,2),(1,3)],
        'vectorizer__min_df': [2,3],
        'tfidf__norm': ['l1','l2'],
        'knn__n_neighbors': [10,25,40],
        'knn__weights': ['uniform','distance'],
        'knn__algorithm':['brute'],
#         'knn__leaf_size':[10,30,50],
        'knn__p':[1,2]
    }
]

In [225]:
grid_knn = GridSearchCV(pipe, cv=5, n_jobs=-1, param_grid=param_grid, scoring = "accuracy",verbose=10)

In [None]:
grid_knn.fit(X_train, y_train)

In [186]:
print_results(grid_knn)

best score:  0.8556271314892004
best parameters:  {'tfidf__norm': 'l2', 'knn__weights': 'distance', 'knn__n_neighbors': 25, 'vectorizer__ngram_range': (1, 2), 'vectorizer__min_df': 3, 'knn__algorithm': 'ball_tree'}


### Naive Bayes

In [54]:
estimators = [('vectorizer', CountVectorizer()), 
              ('tfidf',TfidfTransformer()),
#                ('lda', LDA(learning_method='batch',random_state=rand_state)), 
               ('mnb', MultinomialNB())]

pipe = Pipeline(memory=None,steps = estimators)

In [55]:
param_grid = [
    {
        'vectorizer__ngram_range': [(1,2),(1,3),(1,4)],
        'vectorizer__min_df': [1,2,3],
        'tfidf__norm': ['l1','l2'],
        'tfidf__use_idf': ['True','False'],
#         'lda__n_components': [4,7,10],
        'mnb__alpha': [0.1,0.5,1,10],
        'mnb__fit_prior': [True, False]
    
    }
]

In [56]:
grid_mnb = GridSearchCV(pipe, cv=5, n_jobs=-1, param_grid=param_grid, scoring = "accuracy", verbose=10)

In [None]:
grid_mnb.fit(X_train, y_train)

In [222]:
print_results(grid_mnb)

best score:  0.8769736011115321
best parameters:  {'vectorizer__ngram_range': (1, 4), 'tfidf__norm': 'l2', 'mnb__alpha': 0.5, 'mnb__fit_prior': False, 'vectorizer__min_df': 1, 'tfidf__use_idf': 'True'}


### XGBoost

In [63]:
from xgboost import XGBClassifier

In [72]:
estimators = [('vectorizer', CountVectorizer()), 
              ('tfidf',TfidfTransformer()),
#                ('lda', LDA(learning_method='batch',random_state=rand_state)), 
               ('xgb', XGBClassifier(n_jobs=22, objective='multi:softmax', eval_metric = 'merror'))]
pipe = Pipeline(memory=None, steps = estimators)

In [73]:
param_grid = [
    {
#         'vectorizer__ngram_range': [(1,2),(1,3)],
#         'vectorizer__min_df': [2, 3, 4],
#         'tfidf__norm': ['l1','l2'],
#         'tfidf__use_idf': ['True','False'],
        'xgb__max_depth': [8,9,10],
        'xgb__learning_rate': [0.1],
        'xgb__n_estimators':[300,350,400],
        'xgb__gamma': [0.1],
        'xgb__subsample': [0.5,0.6]
    
    }
]

In [74]:
grid_xgb= GridSearchCV(pipe, cv=5, n_jobs=-1, param_grid=param_grid, scoring = "accuracy", verbose=10)

In [None]:
grid_xgb.fit(X_train, y_train)

In [46]:
print_results(grid_xgb)

best score:  0.8676266262473159
best parameters:  {'xgb__booster': 'gbtree', 'xgb__gamma': 0.1, 'xgb__n_estimators': 300, 'xgb__max_depth': 8, 'xgb__learning_rate': 0.1}


# Prediction with SVM

In [31]:
df2 = pd.read_csv("test1.csv",index_col = 0)
df2.head()

Unnamed: 0_level_0,TITLE,TOPIC
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,INDONESIAN COFFEE PRODUCTION MAY FALL THIS YEAR,
1,INTERNATIONAL BUSINESS MACHINES CORP <IBM> NET,
2,WESTERN TELE-COMMUNICATIONS <WTLCA> 4TH QTR NET,
3,NACCO INDUSTRIES <NC> TO REPORT 2ND QTR GAIN,
4,WALKER TELECOMMUNICATIONS CORP <WTEL> 4TH QTR,


In [32]:
X_pred = [clean(row) for row in df2["TITLE"]]

In [28]:
estimators = [('vectorizer', CountVectorizer(ngram_range=(1,2),min_df=3)), 
              ('tfidf',TfidfTransformer(use_idf=True, norm='l2')),
               ('svc', svm.SVC(C=1, gamma=1, kernel ='rbf',degree=2))]

pipe = Pipeline(memory=None,steps = estimators)

In [29]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
       ...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [30]:
pipe.score(X_train, y_train)

0.9820639130983958

In [34]:
prediction = pipe.predict(X_pred)

In [37]:
df2["TOPIC"] = prediction

In [39]:
df2.groupby("TOPIC").count()

Unnamed: 0_level_0,TITLE
TOPIC,Unnamed: 1_level_1
0,1203
1,1095
2,1095


In [None]:
df.to_csv()