In [57]:
import pandas as pd
import numpy as np
import re #usful for searching words in pharagraph
from nltk.corpus import stopwords  #words doesn't add much value to phara (rticals)
from nltk.stem.porter import PorterStemmer #gives a root word
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer #convert text in to feature vectors
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [58]:
df=pd.read_excel('dataset1.xlsx')

In [59]:
df.head()

Unnamed: 0,dc_indicator,gl,vendor,item_text,gl_acc_name,gl_acc_description,document_reference,label
0,H,61004060,,Export AF - Correction for AI-DN-20210615,Freight inwards charges - Air,This account is used to record the freight cha...,AF PAYMENT,0
1,H,61004060,,Credit Note,Freight inwards charges - Air,This account is used to record the freight cha...,KATF018288,0
2,H,61004060,,Credit Note,Freight inwards charges - Air,This account is used to record the freight cha...,KATF017939,0
3,H,61004060,,Credit Note,Freight inwards charges - Air,This account is used to record the freight cha...,KATF018434,0
4,H,34001010,304285.0,Import Charges - AIR,Freight inwards charges - Air,This account is used to record the freight cha...,KATF017842,1


In [60]:
len(df['item_text'].unique())

774

In [61]:
df.columns

Index(['dc_indicator', 'gl', 'vendor', 'item_text', 'gl_acc_name',
       'gl_acc_description', 'document_reference', 'label'],
      dtype='object')

In [62]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [63]:
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [64]:
df.shape

(5000, 8)

In [65]:
df.isnull().sum()

dc_indicator          0
gl                    0
vendor                0
item_text             0
gl_acc_name           0
gl_acc_description    0
document_reference    0
label                 0
dtype: int64

In [66]:
df.dtypes

dc_indicator          object
gl                     int64
vendor                object
item_text             object
gl_acc_name           object
gl_acc_description    object
document_reference    object
label                  int64
dtype: object

In [67]:
df["messages"] = df["item_text"].astype(str) + " " + df["gl_acc_name"] + " " + df["gl_acc_description"]

In [68]:
df["messages"]

0       Export AF -  Correction for AI-DN-20210615 Fre...
1       Credit Note Freight inwards charges - Air This...
2       Credit Note Freight inwards charges - Air This...
3       Credit Note Freight inwards charges - Air This...
4       Import Charges - AIR Freight inwards charges -...
                              ...                        
4995    FEIGTH CHARGES- AIR Freight inwards charges - ...
4996    FEIGTH CHARGES- AIR Freight inwards charges - ...
4997    FEIGTH CHARGES- AIR Freight inwards charges - ...
4998    FEIGTH CHARGES- AIR Freight inwards charges - ...
4999    FEIGTH CHARGES- AIR Freight inwards charges - ...
Name: messages, Length: 5000, dtype: object

In [69]:
#seperate the data and label
X=df.drop(columns="label",axis=1) #axis=1 "removing a column"
y=df["label"]

In [70]:
X

Unnamed: 0,dc_indicator,gl,vendor,item_text,gl_acc_name,gl_acc_description,document_reference,messages
0,H,61004060,,Export AF - Correction for AI-DN-20210615,Freight inwards charges - Air,This account is used to record the freight cha...,AF PAYMENT,Export AF - Correction for AI-DN-20210615 Fre...
1,H,61004060,,Credit Note,Freight inwards charges - Air,This account is used to record the freight cha...,KATF018288,Credit Note Freight inwards charges - Air This...
2,H,61004060,,Credit Note,Freight inwards charges - Air,This account is used to record the freight cha...,KATF017939,Credit Note Freight inwards charges - Air This...
3,H,61004060,,Credit Note,Freight inwards charges - Air,This account is used to record the freight cha...,KATF018434,Credit Note Freight inwards charges - Air This...
4,H,34001010,304285,Import Charges - AIR,Freight inwards charges - Air,This account is used to record the freight cha...,KATF017842,Import Charges - AIR Freight inwards charges -...
...,...,...,...,...,...,...,...,...
4995,H,34002010,402349,FEIGTH CHARGES- AIR,Freight inwards charges - Air,This account is used to record the freight cha...,110281,FEIGTH CHARGES- AIR Freight inwards charges - ...
4996,H,34002010,402349,FEIGTH CHARGES- AIR,Freight inwards charges - Air,This account is used to record the freight cha...,110281A,FEIGTH CHARGES- AIR Freight inwards charges - ...
4997,H,34002010,402349,FEIGTH CHARGES- AIR,Freight inwards charges - Air,This account is used to record the freight cha...,110376,FEIGTH CHARGES- AIR Freight inwards charges - ...
4998,H,34002010,402349,FEIGTH CHARGES- AIR,Freight inwards charges - Air,This account is used to record the freight cha...,110377,FEIGTH CHARGES- AIR Freight inwards charges - ...


In [18]:
y

0       0
1       0
2       0
3       0
4       1
       ..
4995    1
4996    1
4997    1
4998    1
4999    1
Name: label, Length: 5000, dtype: int64

In [71]:
port_stem=PorterStemmer()

In [72]:
def stemming(content):
    stemmed_content=re.sub("[^a-zA-z]"," ",content)
    stemmed_content=stemmed_content.lower()
    stemmed_content=stemmed_content.split()
    stemmed_content=[port_stem.stem(word) for word in stemmed_content if not word in stopwords.words("english")]
    stemmed_content= " ".join(stemmed_content)
    return stemmed_content

In [73]:
df["messages"] = df["messages"].apply(stemming)

In [74]:
print(df["messages"])

0       export af correct ai dn freight inward charg a...
1       credit note freight inward charg air account u...
2       credit note freight inward charg air account u...
3       credit note freight inward charg air account u...
4       import charg air freight inward charg air acco...
                              ...                        
4995    feigth charg air freight inward charg air acco...
4996    feigth charg air freight inward charg air acco...
4997    feigth charg air freight inward charg air acco...
4998    feigth charg air freight inward charg air acco...
4999    feigth charg air freight inward charg air acco...
Name: messages, Length: 5000, dtype: object


In [75]:
df.head()

Unnamed: 0,dc_indicator,gl,vendor,item_text,gl_acc_name,gl_acc_description,document_reference,label,messages
0,H,61004060,,Export AF - Correction for AI-DN-20210615,Freight inwards charges - Air,This account is used to record the freight cha...,AF PAYMENT,0,export af correct ai dn freight inward charg a...
1,H,61004060,,Credit Note,Freight inwards charges - Air,This account is used to record the freight cha...,KATF018288,0,credit note freight inward charg air account u...
2,H,61004060,,Credit Note,Freight inwards charges - Air,This account is used to record the freight cha...,KATF017939,0,credit note freight inward charg air account u...
3,H,61004060,,Credit Note,Freight inwards charges - Air,This account is used to record the freight cha...,KATF018434,0,credit note freight inward charg air account u...
4,H,34001010,304285.0,Import Charges - AIR,Freight inwards charges - Air,This account is used to record the freight cha...,KATF017842,1,import charg air freight inward charg air acco...


In [162]:
X=df["messages"].values
y=df["label"].values

In [163]:
X

array(['export af correct ai dn freight inward charg air account use record freight charg incur import raw materi product air',
       'credit note freight inward charg air account use record freight charg incur import raw materi product air',
       'credit note freight inward charg air account use record freight charg incur import raw materi product air',
       ...,
       'feigth charg air freight inward charg air account use record freight charg incur import raw materi product air',
       'feigth charg air freight inward charg air account use record freight charg incur import raw materi product air',
       'feigth charg air freight inward charg air account use record freight charg incur import raw materi product air'],
      dtype=object)

In [164]:
y

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [165]:
#converting data to numerical data

vectorizer=TfidfVectorizer(max_features=200 , ngram_range=(1,3))
X=vectorizer.fit_transform(X)

In [166]:
print(X)

  (0, 136)	0.14433756729740646
  (0, 168)	0.14433756729740646
  (0, 119)	0.14433756729740646
  (0, 122)	0.14433756729740646
  (0, 46)	0.14433756729740646
  (0, 79)	0.14433756729740646
  (0, 171)	0.14433756729740646
  (0, 196)	0.14433756729740646
  (0, 2)	0.14433756729740646
  (0, 5)	0.14433756729740646
  (0, 41)	0.14433756729740646
  (0, 125)	0.14433756729740646
  (0, 89)	0.14433756729740646
  (0, 156)	0.14433756729740646
  (0, 135)	0.14433756729740646
  (0, 167)	0.14433756729740646
  (0, 118)	0.14433756729740646
  (0, 121)	0.14433756729740646
  (0, 45)	0.14433756729740646
  (0, 77)	0.14433756729740646
  (0, 170)	0.14433756729740646
  (0, 195)	0.14433756729740646
  (0, 1)	0.14433756729740646
  (0, 4)	0.14433756729740646
  (0, 40)	0.14433756729740646
  :	:
  (4999, 135)	0.07190787721243505
  (4999, 167)	0.07190787721243505
  (4999, 118)	0.07190787721243505
  (4999, 121)	0.07190787721243505
  (4999, 45)	0.07190787721243505
  (4999, 77)	0.07190787721243505
  (4999, 170)	0.0719078772124350

In [167]:
X.shape

(5000, 200)

In [168]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,stratify=y,random_state=2)

In [169]:
log_reg=LogisticRegression()

In [170]:
log_reg.fit(X_train,y_train)

LogisticRegression()

In [171]:
#accuracy on train data

X_train_pred=log_reg.predict(X_train)
acc = accuracy_score(X_train_pred,y_train)

In [172]:
print("train data accuracy : ",acc*100,"%")

train data accuracy :  98.05 %


In [173]:
#accuracy on test data

X_test_pred=log_reg.predict(X_test)
acc=accuracy_score(X_test_pred,y_test)

In [174]:
print("train data accuracy : ",acc*100,"%")

train data accuracy :  98.8 %


In [175]:
log_reg.predict(X_test[:30])

array([1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 1, 0], dtype=int64)

In [176]:
print(y_test[:30])

[1 0 1 0 0 0 1 1 1 0 1 1 1 0 0 0 1 1 0 0 0 0 0 1 1 1 1 0 1 0]


In [177]:
#### trying bring up a test file

df_test=pd.read_excel('dataset1_test.xlsx')

In [178]:
df_test.head()

Unnamed: 0,dc_indicator,gl,vendor,item_text,gl_acc_name,gl_acc_description,document_reference,label
0,H,34002010,402349,FEIGTH CHARGES- AIR,Freight inwards charges - Air,This account is used to record the freight cha...,110377,1
1,H,34002010,402349,FEIGTH CHARGES- AIR,Freight inwards charges - Air,This account is used to record the freight cha...,110672A,1
2,H,34002010,402349,FEIGTH CHARGES- AIR,Freight inwards charges - Air,This account is used to record the freight cha...,110673,1
3,H,34002010,402349,FEIGTH CHARGES- AIR,Freight inwards charges - Air,This account is used to record the freight cha...,110720,1
4,H,34002010,402349,FEIGTH CHARGES- AIR,Freight inwards charges - Air,This account is used to record the freight cha...,110721,1


In [179]:
df_test.shape

(674, 8)

In [180]:
df_test.isnull().sum()

dc_indicator          0
gl                    0
vendor                0
item_text             0
gl_acc_name           0
gl_acc_description    0
document_reference    0
label                 0
dtype: int64

In [181]:
df_test["messages_test"] = df_test["item_text"].astype(str) + " " + df_test["gl_acc_name"] + " " + df_test["gl_acc_description"]

In [182]:
df_test["messages_test"]

0      FEIGTH CHARGES- AIR Freight inwards charges - ...
1      FEIGTH CHARGES- AIR Freight inwards charges - ...
2      FEIGTH CHARGES- AIR Freight inwards charges - ...
3      FEIGTH CHARGES- AIR Freight inwards charges - ...
4      FEIGTH CHARGES- AIR Freight inwards charges - ...
                             ...                        
669    Import Air Freight Freight inwards charges - A...
670    IMPORT AIR Freight inwards charges - Air This ...
671    IMPORT AIR FREIGHT Freight inwards charges - A...
672    IMPORT AIR FREIGHT Freight inwards charges - A...
673    IMPORT AIR Freight inwards charges - Air This ...
Name: messages_test, Length: 674, dtype: object

In [183]:
#seperate the data and label
X_testing =df.drop(columns="label",axis=1) #axis=1 "removing a column"

In [184]:
X_testing

Unnamed: 0,dc_indicator,gl,vendor,item_text,gl_acc_name,gl_acc_description,document_reference,messages
0,H,61004060,,Export AF - Correction for AI-DN-20210615,Freight inwards charges - Air,This account is used to record the freight cha...,AF PAYMENT,export af correct ai dn freight inward charg a...
1,H,61004060,,Credit Note,Freight inwards charges - Air,This account is used to record the freight cha...,KATF018288,credit note freight inward charg air account u...
2,H,61004060,,Credit Note,Freight inwards charges - Air,This account is used to record the freight cha...,KATF017939,credit note freight inward charg air account u...
3,H,61004060,,Credit Note,Freight inwards charges - Air,This account is used to record the freight cha...,KATF018434,credit note freight inward charg air account u...
4,H,34001010,304285,Import Charges - AIR,Freight inwards charges - Air,This account is used to record the freight cha...,KATF017842,import charg air freight inward charg air acco...
...,...,...,...,...,...,...,...,...
4995,H,34002010,402349,FEIGTH CHARGES- AIR,Freight inwards charges - Air,This account is used to record the freight cha...,110281,feigth charg air freight inward charg air acco...
4996,H,34002010,402349,FEIGTH CHARGES- AIR,Freight inwards charges - Air,This account is used to record the freight cha...,110281A,feigth charg air freight inward charg air acco...
4997,H,34002010,402349,FEIGTH CHARGES- AIR,Freight inwards charges - Air,This account is used to record the freight cha...,110376,feigth charg air freight inward charg air acco...
4998,H,34002010,402349,FEIGTH CHARGES- AIR,Freight inwards charges - Air,This account is used to record the freight cha...,110377,feigth charg air freight inward charg air acco...


In [185]:
df_test["messages_test"] = df_test["messages_test"].apply(stemming)

In [186]:
print(df_test["messages_test"])

0      feigth charg air freight inward charg air acco...
1      feigth charg air freight inward charg air acco...
2      feigth charg air freight inward charg air acco...
3      feigth charg air freight inward charg air acco...
4      feigth charg air freight inward charg air acco...
                             ...                        
669    import air freight freight inward charg air ac...
670    import air freight inward charg air account us...
671    import air freight freight inward charg air ac...
672    import air freight freight inward charg air ac...
673    import air freight inward charg air account us...
Name: messages_test, Length: 674, dtype: object


In [187]:
X_testing=df_test["messages_test"].values
y_testing=df_test["label"].values

In [188]:
#converting data to numerical data

vectorizer=TfidfVectorizer(max_features=200 , ngram_range=(1,3))
X_testing=vectorizer.fit_transform(X_testing)

In [189]:
print(X_testing)

  (0, 124)	0.11401878697385624
  (0, 163)	0.11401878697385624
  (0, 85)	0.11401878697385624
  (0, 88)	0.11401878697385624
  (0, 25)	0.11401878697385624
  (0, 61)	0.11401878697385624
  (0, 166)	0.11401878697385624
  (0, 196)	0.11401878697385624
  (0, 2)	0.11401878697385624
  (0, 9)	0.11401878697385624
  (0, 22)	0.11401878697385624
  (0, 91)	0.11401878697385624
  (0, 69)	0.11401878697385624
  (0, 13)	0.18287656206768774
  (0, 23)	0.1959589514976437
  (0, 54)	0.19841106292934876
  (0, 143)	0.11401878697385624
  (0, 123)	0.11401878697385624
  (0, 162)	0.11401878697385624
  (0, 84)	0.11401878697385624
  (0, 87)	0.11401878697385624
  (0, 24)	0.11401878697385624
  (0, 59)	0.11401878697385624
  (0, 165)	0.11401878697385624
  (0, 195)	0.11401878697385624
  :	:
  (673, 162)	0.12094336549695937
  (673, 84)	0.12094336549695937
  (673, 87)	0.12094336549695937
  (673, 24)	0.12094336549695937
  (673, 59)	0.12094336549695937
  (673, 165)	0.12094336549695937
  (673, 195)	0.12094336549695937
  (673, 1)	

In [190]:
X_testing.shape

(674, 200)

In [191]:
#accuracy on external data

X_testing_pred=log_reg.predict(X_testing)
acc = accuracy_score(X_testing_pred,y_testing)

In [192]:
print("train data accuracy : ",acc*100,"%")

train data accuracy :  89.02077151335311 %
