In [135]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import spacy
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [6]:
# Read the labelled training set and the test set from the local data folder.
train, test = (
    pd.read_csv('data/train.csv'),
    pd.read_csv('data/test.csv'),
)

In [7]:
# Preview the first rows of the training data (id, description, category)
train.head()

Unnamed: 0,id,description,category
0,1,A marriage of 13 and 18 year old bourbons. A m...,2
1,2,There have been some legendary Bowmores from t...,1
2,3,This bottling celebrates master distiller Park...,2
3,4,What impresses me most is how this whisky evol...,1
4,9,"A caramel-laden fruit bouquet, followed by une...",2


In [10]:
# Load spaCy's large English model as the natural language processor.
# NOTE(review): `nlp` is not referenced by any other cell visible in this
# notebook — confirm it is still needed before keeping this load.
nlp = spacy.load('en_core_web_lg')

In [266]:
# Fixed seed shared by the stochastic estimators below, so that a fresh
# "Restart & Run All" reproduces the same cross-validation scores and predictions.
SEED = 42

# TF-IDF vectorizer: lowercases the text, drops English stop words, and ignores
# terms that appear in fewer than 1.25% or in more than 90% of the documents.
vectorizor = TfidfVectorizer(stop_words='english', lowercase=True, min_df=0.0125, max_df=0.90)

# Gradient-boosted tree classifier with 2000 boosting stages (seeded).
classifier = GradientBoostingClassifier(n_estimators=2000, random_state=SEED)

# Truncated singular value decomposition using the randomized solver (seeded).
# NOTE(review): n_iter=1000 is far above scikit-learn's default of 5 for the
# randomized solver; it is kept unchanged here — confirm it is intentional.
svd = TruncatedSVD(n_iter=1000, algorithm='randomized', random_state=SEED)

In [267]:
# Latent semantic indexing stage: TF-IDF vectorization followed by truncated SVD.
lsi = Pipeline(
    steps=[
        ('vect', vectorizor),
        ('svd', svd),
    ]
)

In [268]:
# Full model pipeline: the LSI feature stage feeding the classifier.
pipe = Pipeline(
    steps=[
        ('lsi', lsi),
        ('clf', classifier),
    ]
)

In [269]:
# Hyper-parameter grid for the search. Keys use the nested step names
# (pipeline step 'lsi' -> its inner steps 'vect' and 'svd').
params = {
    'lsi__vect__max_features': (700, 725, 750),
    'lsi__svd__n_components': (15, 16, 17),
}

In [270]:
# Exhaustive 5-fold cross-validated search over `params` using all CPU cores,
# fitted on the training descriptions and their category labels. The cell then
# prints the best cross-validation score and displays the winning parameters.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    cv=5,
    n_jobs=-1,
).fit(train['description'], train['category'])
print('best Score: ', grid_search.best_score_)
grid_search.best_params_

best Score:  0.9033255993812839


{'lsi__svd__n_components': 16, 'lsi__vect__max_features': 750}

In [271]:
# Keep the best pipeline found by the grid search as the final model
# (GridSearchCV is created without `refit`, so presumably the default refit on
# the full training data applies — verify against the installed scikit-learn).
mod = grid_search.best_estimator_

In [272]:
# Predict a category for every description in the test set with the selected model
predictions = mod.predict(test['description'])

In [273]:
# Build the submission frame: each test id paired with its predicted category.
submission = pd.DataFrame(
    data={
        'id': test['id'],
        'category': predictions,
    }
)
submission.head()

Unnamed: 0,id,category
0,955,2
1,3532,3
2,1390,1
3,1024,1
4,1902,1


In [274]:
# Write the submission to CSV. index=False (the documented boolean form, rather
# than the integer 0) leaves the DataFrame index out of the file, so it holds
# only the 'id' and 'category' columns.
submission.to_csv('data/submission.csv', index=False)

In [275]:
# Read the written file back and confirm it round-trips to the same content as
# the in-memory submission frame (an equality check, not a schema validation)
check = pd.read_csv('data/submission.csv')

check.equals(submission)

True

In [1]:
# Reload the submission file on its own.
# NOTE(review): pandas is already imported in the first cell and the same file
# is already read into `check` in an earlier cell; the restarted execution count
# suggests this cell was run in a fresh kernel — confirm whether it is still needed.
import pandas as pd
check = pd.read_csv('data/submission.csv')

In [2]:
# Number of rows in the reloaded submission file
len(check)

288