In [1]:
import pandas as pd
df = pd.read_csv('./kaggle/sample_submission.csv')
df.head()

Unnamed: 0,id,category
0,955,1
1,3532,3
2,1390,2
3,1024,4
4,1902,2


In [13]:
train = pd.read_csv('./kaggle/train.csv')
train.shape

(2874, 7)

In [15]:
train = train.dropna()
train.shape

(2476, 7)

In [16]:
import spacy
from spacy.tokenizer import Tokenizer
nlp = spacy.load("en_core_web_lg")
tokenizer = Tokenizer(nlp.vocab)

tokens = []

STOP_WORDS = nlp.Defaults.stop_words

for doc in tokenizer.pipe(train['description'], batch_size=500):
    doc_tokens = []
    
    for token in doc:
        if token.text not in STOP_WORDS:
            doc_tokens.append(token.text.lower())
    tokens.append(doc_tokens)
    
train['tokens'] = tokens

In [5]:
train['tokens'].head()

0    [a, marriage, 13, 18, year, old, bourbons., a,...
1    [there, legendary, bowmores, mid-60s, bit, equ...
2    [this, bottling, celebrates, master, distiller...
3    [what, impresses, whisky, evolves;, it's, incr...
4    [after, 40, years, barrels,, trademark, canadi...
Name: tokens, dtype: object

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit(train['description'])
dtm = vectorizer.transform(train['description'])

In [8]:
dtm.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [9]:
train.head()

<bound method spmatrix.get_shape of <2874x9450 sparse matrix of type '<class 'numpy.int64'>'
	with 114509 stored elements in Compressed Sparse Row format>>

In [27]:
data[0][:10]

['a',
 'marriage',
 '13',
 '18',
 'year',
 'old',
 'bourbons.',
 'a',
 'mature',
 'elegant']

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
dtm = tfidf.fit_transform(train['description'])
docs = pd.DataFrame(dtm.todense(), columns = tfidf.get_feature_names())

In [19]:
from sklearn.pipeline import Pipeline 
from sklearn.linear_model import SGDClassifier

In [20]:
sgdc = SGDClassifier()
pipe = Pipeline([('vect', tfidf), ('classifier', sgdc)])

In [22]:
pipe.fit(train['description'], train['category'])

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=5000,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_patte...
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
           

In [33]:
test = pd.read_csv('./kaggle/test.csv')
submission = pipe.predict(test['description'])

In [34]:
test.head()

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol
0,955,Fred Minnick,"Think carnival aromas—the good ones, anyway—me...",36.0,90,50.0
1,3532,Lew Bryson,"A blend of three bourbons, between 6 and 12 ye...",90.0,82,49.3
2,1390,Davin de Kergommeaux,"The nose is focused on cereal, hints of fresh ...",48.0,89,45.0
3,1024,Gavin Smith,Swiss-based Chapter 7 released this 19 year ol...,180.0,90,55.8
4,1902,Gavin Smith,Valkyrie replaces the current Dark Origins exp...,71.0,87,45.9


In [35]:
submission_df = pd.DataFrame({'id':test['id'], 'category':submission})
submission_df.head()

Unnamed: 0,id,category
0,955,2.0
1,3532,2.0
2,1390,4.0
3,1024,1.0
4,1902,1.0


In [36]:
submission_df['category'] = submission_df['category'].astype('int64')

In [37]:
submission_df.head()

Unnamed: 0,id,category
0,955,2
1,3532,2
2,1390,4
3,1024,1
4,1902,1


In [40]:
submission_df.to_csv('./submission.csv',index=False)