In [1]:
import pandas as pd
import numpy as np
import nltk

In [18]:
# Load the labelled training invoices from Train.csv and preview the first rows.
train_data = pd.read_csv('Train.csv')
train_data.head(n=5)

Unnamed: 0,Inv_Id,Vendor_Code,GL_Code,Inv_Amt,Item_Description,Product_Category
0,1,VENDOR-61,GL-6050100,6.973473,AETNA VARIABLE FUND - Apr-2002 - Store Managem...,CLASS-784
1,2,VENDOR-61,GL-6050100,25.053841,AETNA VARIABLE FUND - Nov-2000 - Store Managem...,CLASS-784
2,3,VENDOR-449,GL-6050100,53.573737,FAIRCHILD CORP - Nov-2001 - Store Management R...,CLASS-784
3,4,VENDOR-682,GL-6050100,67.388827,CALIFORNIA REAL ESTATE INVESTMENT TRUST - Aug-...,CLASS-784
4,5,VENDOR-682,GL-6050100,74.262047,CALIFORNIA REAL ESTATE INVESTMENT TRUST - Mar-...,CLASS-784


In [19]:
"""
Preprocessing the description of items.

"""
# Lazily-built NLTK helpers shared by every preprocessor() call. Building them
# once avoids re-compiling the regex, re-reading the stop-word corpus and
# re-creating the stemmer for each row passed through Series.apply.
_PREPROCESSOR_RESOURCES = {}


def _get_preprocessor_resources():
    """Return the shared (tokenizer, stop-word set, stemmer), building them on first use."""
    if 'resources' not in _PREPROCESSOR_RESOURCES:
        tokenization_pattern = r'''(?x)          # set flag to allow verbose regexps
        \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
      | (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
      | \w+(?:-\w+)*        # words with optional internal hyphens
      | \.\.\.              # ellipsis
      | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
    '''
        tokenizer = nltk.tokenize.regexp.RegexpTokenizer(tokenization_pattern)
        en_stopwords = set(nltk.corpus.stopwords.words('english'))
        stemmer = nltk.stem.snowball.SnowballStemmer("english")
        # Store everything in one assignment so a failure part-way through
        # (e.g. a missing stop-word corpus) never leaves a half-filled cache.
        _PREPROCESSOR_RESOURCES['resources'] = (tokenizer, en_stopwords, stemmer)
    return _PREPROCESSOR_RESOURCES['resources']


def preprocessor(text):
    """Turn one item description into a list of stemmed, lower-case word tokens.

    Steps: lower-case the text, tokenize it with a regular expression, keep
    only purely alphabetic tokens, drop English stop words, and stem what is
    left with the English Snowball stemmer.

    Parameters
    ----------
    text : str or None or float NaN
        Raw description. A missing value (None or NaN) is treated as an
        empty description.

    Returns
    -------
    list of str
        The stems, in their original order; ``[]`` for a missing value.
    """
    # A missing cell has no .lower(); NaN is the only float for which
    # `value != value` holds. Other non-string inputs still raise as before.
    if text is None or (isinstance(text, float) and text != text):
        return []

    tokenizer, en_stopwords, stemmer = _get_preprocessor_resources()

    tokens = tokenizer.tokenize(text.lower())
    # isalpha() discards numbers, punctuation and hyphenated tokens.
    alphabet_tokens = [token for token in tokens if token.isalpha()]
    non_stopwords = [word for word in alphabet_tokens if word not in en_stopwords]
    stems = [str(stemmer.stem(word)) for word in non_stopwords]

    return stems

# Replace each raw description with its list of stems. Entries that are already
# token lists are passed through unchanged, so re-running this cell neither
# fails on nor re-processes its own earlier output.
train_data['Item_Description'] = train_data['Item_Description'].apply(
    lambda description: description if isinstance(description, list) else preprocessor(description)
)

In [6]:
"""
Converting the descriptions of items into vector form 

"""
from sklearn.feature_extraction.text import CountVectorizer
def identity_tokenizer(tokens):
    """Return an already-tokenized document unchanged.

    A named function is used instead of a lambda so that the vectorizer
    (and any pipeline containing it) can be pickled.
    """
    return tokens


bow_vectorizer = CountVectorizer(
    lowercase=False,               # text was lower-cased in preprocessor()
    tokenizer=identity_tokenizer,  # documents are already lists of tokens
    token_pattern=None,            # unused with a custom tokenizer; None avoids sklearn's warning
    stop_words=None,               # stop words were already removed with NLTK
    max_features=150000,           # keep the 150K most frequent n-grams
    ngram_range=(1, 2),            # count unigrams and bigrams
    binary=False,                  # keep raw counts rather than 0/1 indicators
)


In [11]:
"""
Creating a pipeline to train the machine learning model

"""
from sklearn.pipeline import Pipeline
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_val_score


# Fixed seed so the train/hold-out split, and therefore the reported accuracy,
# is the same on every run.
SPLIT_SEED = 42

# Bag-of-n-grams counts -> TF-IDF weighting -> logistic regression classifier.
c = Pipeline(steps=[
    ('bow', bow_vectorizer),
    ('tfidf', TfidfTransformer()),
    ('lr', LogisticRegression(C=20)),
])

# Boolean row mask: each row goes to the training part with probability 0.75,
# the remaining rows form the hold-out part.
msk = np.random.RandomState(SPLIT_SEED).rand(len(train_data)) < 0.75
train_X = train_data.Item_Description[msk]
test_X = train_data.Item_Description[~msk]
y = train_data['Product_Category']
train_y = y[msk]
test_y = y[~msk]

c.fit(train_X, train_y)

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=150000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        str...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [12]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
# Accuracy of the fitted pipeline on the rows held out from training.
held_out_labels = pd.Series(test_y).values
held_out_predictions = c.predict(test_X)
print(accuracy_score(held_out_labels, held_out_predictions))





0.9993201903467029


In [14]:
# Load the unlabelled invoices and tokenize/stem their descriptions with the
# same preprocessor used for the training rows.
test_data = pd.read_csv('Test.csv').assign(
    Item_Description=lambda frame: frame['Item_Description'].apply(preprocessor)
)

In [15]:
# Assign the predicted labels positionally. Wrapping the array in a pd.Series
# would make pandas align it by index label instead, which silently yields NaN
# whenever test_data's index is not the default 0..n-1 range.
test_data['Product_Category'] = c.predict(test_data['Item_Description'])

In [16]:
# Preview the first predictions; a bare last expression gets the notebook's
# rich table display, matching how train_data.head() is shown earlier.
test_data.head()

   Inv_Id  Vendor_Code     GL_Code    Inv_Amt  \
0       6  VENDOR-1197  GL-6050100  10.916343   
1      12   VENDOR-792  GL-6050100  38.658772   
2      14   VENDOR-792  GL-6050100  46.780476   
3      18   VENDOR-792  GL-6050100   7.058866   
4      19   VENDOR-792  GL-6050100  32.931765   

                                    Item_Description Product_Category  
0  [desoto, inc, store, manag, real, estat, real,...        CLASS-784  
1  [centuri, realti, trust, store, manag, real, e...        CLASS-784  
2  [centuri, realti, trust, store, manag, real, e...        CLASS-784  
3  [centuri, realti, trust, store, manag, real, e...        CLASS-784  
4  [centuri, realti, trust, store, manag, real, e...        CLASS-784  


In [17]:
# Write the invoice id and predicted category of every test row to output.csv,
# without the DataFrame index.
test_data.to_csv('output.csv', columns=['Inv_Id', 'Product_Category'], index=False)