In [100]:
import pandas as pd
import numpy as np
import re #usful for searching words in pharagraph
from nltk.corpus import stopwords  #words doesn't add much value to phara (rticals)
from nltk.stem.porter import PorterStemmer #gives a root word
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer #convert text in to feature vectors
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [101]:
df=pd.read_excel('dataset1.xlsx')

In [102]:
df.head()

Unnamed: 0,dc_indicator,gl,vendor,item_text,gl_acc_name,gl_acc_description,document_reference,label
0,H,61004060,,Export AF - Correction for AI-DN-20210615,Freight inwards charges - Air,This account is used to record the freight cha...,AF PAYMENT,0
1,H,61004060,,Credit Note,Freight inwards charges - Air,This account is used to record the freight cha...,KATF018288,0
2,H,61004060,,Credit Note,Freight inwards charges - Air,This account is used to record the freight cha...,KATF017939,0
3,H,61004060,,Credit Note,Freight inwards charges - Air,This account is used to record the freight cha...,KATF018434,0
4,H,34001010,304285.0,Import Charges - AIR,Freight inwards charges - Air,This account is used to record the freight cha...,KATF017842,1


In [103]:
len(df['item_text'].unique())

774

In [104]:
df.columns

Index(['dc_indicator', 'gl', 'vendor', 'item_text', 'gl_acc_name',
       'gl_acc_description', 'document_reference', 'label'],
      dtype='object')

In [105]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Saranga\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [106]:
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [107]:
df.shape

(5000, 8)

In [108]:
df.isnull().sum()

dc_indicator          0
gl                    0
vendor                0
item_text             0
gl_acc_name           0
gl_acc_description    0
document_reference    0
label                 0
dtype: int64

In [109]:
df.dtypes

dc_indicator          object
gl                     int64
vendor                object
item_text             object
gl_acc_name           object
gl_acc_description    object
document_reference    object
label                  int64
dtype: object

In [110]:
# df["messages"] = df["item_text"].astype(str) + " " + df["gl_acc_name"] + " " + df["gl_acc_description"]

In [111]:
# df["messages"]

In [112]:
#seperate the data and label
X=df.drop(columns="label",axis=1) #axis=1 "removing a column"
y=df["label"]

In [113]:
X.head()

Unnamed: 0,dc_indicator,gl,vendor,item_text,gl_acc_name,gl_acc_description,document_reference
0,H,61004060,,Export AF - Correction for AI-DN-20210615,Freight inwards charges - Air,This account is used to record the freight cha...,AF PAYMENT
1,H,61004060,,Credit Note,Freight inwards charges - Air,This account is used to record the freight cha...,KATF018288
2,H,61004060,,Credit Note,Freight inwards charges - Air,This account is used to record the freight cha...,KATF017939
3,H,61004060,,Credit Note,Freight inwards charges - Air,This account is used to record the freight cha...,KATF018434
4,H,34001010,304285.0,Import Charges - AIR,Freight inwards charges - Air,This account is used to record the freight cha...,KATF017842


In [114]:
y

0       0
1       0
2       0
3       0
4       1
       ..
4995    1
4996    1
4997    1
4998    1
4999    1
Name: label, Length: 5000, dtype: int64

In [115]:
port_stem=PorterStemmer()

In [116]:
def stemming(content):
    stemmed_content=re.sub("[^a-zA-z]"," ",content)
    stemmed_content=stemmed_content.lower()
    stemmed_content=stemmed_content.split()
    stemmed_content=[port_stem.stem(word) for word in stemmed_content if not word in stopwords.words("english")]
    stemmed_content= " ".join(stemmed_content)
    return stemmed_content

In [117]:
X["item_text"]=X["item_text"].apply(stemming)
X["gl_acc_name"]=X["gl_acc_name"].apply(stemming)
X["gl_acc_description"]=X["gl_acc_description"].apply(stemming)

In [118]:
X.head()

Unnamed: 0,dc_indicator,gl,vendor,item_text,gl_acc_name,gl_acc_description,document_reference
0,H,61004060,,export af correct ai dn,freight inward charg air,account use record freight charg incur import ...,AF PAYMENT
1,H,61004060,,credit note,freight inward charg air,account use record freight charg incur import ...,KATF018288
2,H,61004060,,credit note,freight inward charg air,account use record freight charg incur import ...,KATF017939
3,H,61004060,,credit note,freight inward charg air,account use record freight charg incur import ...,KATF018434
4,H,34001010,304285.0,import charg air,freight inward charg air,account use record freight charg incur import ...,KATF017842


In [119]:
X=df[["item_text","gl_acc_name","gl_acc_description"]]
y=df["label"].values

In [120]:
X

Unnamed: 0,item_text,gl_acc_name,gl_acc_description
0,Export AF - Correction for AI-DN-20210615,Freight inwards charges - Air,This account is used to record the freight cha...
1,Credit Note,Freight inwards charges - Air,This account is used to record the freight cha...
2,Credit Note,Freight inwards charges - Air,This account is used to record the freight cha...
3,Credit Note,Freight inwards charges - Air,This account is used to record the freight cha...
4,Import Charges - AIR,Freight inwards charges - Air,This account is used to record the freight cha...
...,...,...,...
4995,FEIGTH CHARGES- AIR,Freight inwards charges - Air,This account is used to record the freight cha...
4996,FEIGTH CHARGES- AIR,Freight inwards charges - Air,This account is used to record the freight cha...
4997,FEIGTH CHARGES- AIR,Freight inwards charges - Air,This account is used to record the freight cha...
4998,FEIGTH CHARGES- AIR,Freight inwards charges - Air,This account is used to record the freight cha...


In [121]:
y

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [122]:
sample = X.apply(lambda col: col.str.strip())

In [124]:
#converting data to numerical data

vectorizer=TfidfVectorizer(max_features=50 , ngram_range=(1,3))
X=sample.apply(lambda col: vectorizer.fit_transform(col))

In [125]:
X.shape

(3,)

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,stratify=y,random_state=2)

In [None]:
log_reg=LogisticRegression()

In [None]:
log_reg.fit(X_train,y_train)

In [None]:
#accuracy on train data

X_train_pred=log_reg.predict(X_train)
acc = accuracy_score(X_train_pred,y_train)

In [None]:
print("train data accuracy : ",acc*100,"%")

In [None]:
#accuracy on test data

X_test_pred=log_reg.predict(X_test)
acc=accuracy_score(X_test_pred,y_test)

In [None]:
print("train data accuracy : ",acc*100,"%")

In [None]:
log_reg.predict(X_test[:30])

In [None]:
print(y_test[:30])

In [None]:
#### trying bring up a test file

df_test=pd.read_excel('dataset1_test.xlsx')

In [None]:
df_test.head()

In [None]:
df_test.shape

In [None]:
df_test.isnull().sum()

In [None]:
df_test["messages_test"] = df_test["item_text"].astype(str) + " " + df_test["gl_acc_name"] + " " + df_test["gl_acc_description"]

In [None]:
df_test["messages_test"]

In [None]:
#seperate the data and label
X_testing =df.drop(columns="label",axis=1) #axis=1 "removing a column"

In [None]:
X_testing

In [None]:
df_test["messages_test"] = df_test["messages_test"].apply(stemming)

In [None]:
print(df_test["messages_test"])

In [None]:
X_testing=df_test["messages_test"].values
y_testing=df_test["label"].values

In [None]:
#converting data to numerical data

vectorizer=TfidfVectorizer(max_features=50 , ngram_range=(1,3))
X_testing=vectorizer.fit_transform(X_testing)

In [None]:
print(X_testing)

In [None]:
X_testing.shape

In [None]:
#accuracy on external data

X_testing_pred=log_reg.predict(X_testing)
acc = accuracy_score(X_testing_pred,y_testing)

In [None]:
print("train data accuracy : ",acc*100,"%")