In [None]:
import codecs
import io

import pandas as pd

# Read the raw CSV text. errors='ignore' discards any bytes that are not valid
# UTF-8, so a few badly encoded characters do not abort the load.
with codecs.open('Splitfiles_by10k.csv', 'r', encoding='utf-8', errors='ignore') as file:
    data = file.read()

# Parse the decoded text into a DataFrame. `pd.compat.StringIO` no longer exists
# in pandas; the standard-library io.StringIO is the in-memory text buffer that
# read_csv accepts.
df = pd.read_csv(io.StringIO(data))

In [None]:
# Keep only the label column and the free-text column, and drop rows whose
# description is missing BEFORE casting to str: astype('str') turns NaN into the
# literal text 'nan', which would otherwise pass every filter below (and any
# later isna() check).
df = (
    df.loc[:, ['Part of Body', 'Claim Description']]
    .dropna(subset=['Claim Description'])
    .copy()
)
df['Claim Description'] = df['Claim Description'].astype('str')

# Discard placeholder descriptions: the empty string and a lone '*'.
df = df[~df['Claim Description'].isin(['', '*'])]

In [None]:
# Number of claims recorded for each 'Part of Body' label.
x = df['Part of Body'].value_counts()

# NOTE: despite its name, `x_to_drop` holds the labels that are KEPT, i.e. the
# ones that occur more than 70 times.
x_to_drop = x.loc[x > 70].index

# Retain only the rows whose label is in that set.
is_frequent_label = df['Part of Body'].isin(x_to_drop)
df = df.loc[is_frequent_label]

In [None]:
# Drop rows with a missing description first and reset the index afterwards.
# Doing it the other way round (reset, then drop) can leave gaps in the row
# index, which breaks any later code that looks rows up by 0..len(df)-1.
# reset_index() keeps the previous row labels in an 'index' column, as before.
df = (
    df[['Claim Description', 'Part of Body']]
    .dropna(subset=['Claim Description'])
    .reset_index()
)
df.head()

In [None]:
# Data cleaning and preprocessing
import re
import nltk

# Download the NLTK resources requested by this notebook.
for resource_name in ('stopwords', 'wordnet'):
    nltk.download(resource_name)

In [None]:
# NOTE(review): `stopwords` is imported here but is not referenced by any cell
# visible in this notebook — confirm whether stop-word removal was intended.
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance, used by the text-cleaning cell that builds `corpus`.
lemmatizer=WordNetLemmatizer()

In [None]:
def _clean_description(text):
    """Normalise one claim description into a space-separated token string.

    Every character other than an ASCII letter or a parenthesis is replaced
    by a space, the text is lower-cased and split on whitespace, each token
    is passed through the shared `lemmatizer`, and the tokens are re-joined
    with single spaces.
    """
    letters_only = re.sub('[^a-zA-Z()]', ' ', text).lower()
    return ' '.join(lemmatizer.lemmatize(word) for word in letters_only.split())


# Iterate over the column's values directly. The previous
# `df['Claim Description'][i]` lookup is label-based and raises KeyError as soon
# as the row index is not exactly 0..len(df)-1 (e.g. after rows were dropped).
corpus = [_clean_description(text) for text in df['Claim Description']]

In [None]:
# Preview a few cleaned descriptions instead of dumping the whole list into the output.
corpus[:10]

In [None]:
# Feature extraction.
# A bag-of-words variant (CountVectorizer(max_features=2500)) was tried earlier
# and is disabled; TF-IDF is the representation in use.
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(max_features=3000)

# Fit the vectorizer on the cleaned corpus, then convert the result to a dense
# array.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so test-set vocabulary statistics leak into the features.
tfidf_matrix = tv.fit_transform(corpus)
X = tfidf_matrix.toarray()

# `df1` is the name the later cells use for the feature matrix.
df1 = X

In [None]:
#from sklearn.decomposition import PCA
#pca = PCA(n_components=0.30)
#tfIdfMat_reduced = pca.fit_transform(X.toarray())
#tfIdfMat_reduced

In [None]:
# Disabled experiment: wrap the PCA-reduced features in a DataFrame and attach
# the original description text.
#df1=pd.DataFrame(tfIdfMat_reduced,columns=["PCA"+str(x) for x in range(len(tfIdfMat_reduced[0]))])
#df1['Description']= df['Claim Description']
# Display the feature matrix that the TF-IDF cell assigned to `df1`.
df1

In [None]:
# Target labels as a plain list, in the same row order as `df`.
# (An earlier one-hot variant based on pd.get_dummies is no longer used.)
y = df['Part of Body'].tolist()

# Preview a few labels instead of dumping the whole list into the output.
y[:10]

In [None]:
#from sklearn.preprocessing import MinMaxScaler
#scaler = MinMaxScaler()
#X = scaler.fit_transform(df1)
#pd.DataFrame(X).head()

In [None]:
# Train / test split

import numpy as np
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state is fixed so the same
# split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df1,
    y,
    test_size=0.20,
    random_state=0,
)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# Optional alternative (needs `pip install xgboost`):
#from xgboost import XGBClassifier

# Classifiers that were tried and are currently disabled.
# NOTE(review): the trailing "-13" / "-53" presumably record scores observed
# for those models — confirm before relying on them.
#POB_detect_model = MultinomialNB().fit(X_train, y_train)-13
#POB_detect_model = RandomForestClassifier().fit(X_train, y_train)-53
#POB_detect_model = XGBClassifier().fit(X_train, y_train)

# Active model: logistic regression with default settings; fit() returns the
# fitted estimator itself.
logistic_clf = LogisticRegression()
POB_detect_model = logistic_clf.fit(X_train, y_train)

In [None]:
# Predicted labels for the training rows and for the held-out rows.
y_pred_train = POB_detect_model.predict(X_train)
y_pred = POB_detect_model.predict(X_test)

# Per-class probabilities for the held-out rows.
y_proba = POB_detect_model.predict_proba(X_test)

# Confidence (highest class probability, in percent) for EVERY row of `df`.
# The previous code assigned the test-set probabilities, indexed 0..n_test-1,
# straight onto `df`; pandas aligned them by index, so confidences from shuffled
# test rows were attached to the first n_test rows of `df` and every other row
# became NaN. Scoring the full feature matrix keeps each value on its own row.
# Assumes `df1` has one row per row of `df`, in the same order (it is built
# from `corpus`, which is built from `df`). Note that this includes rows the
# model was trained on, whose confidences will tend to be optimistic.
df['Probability'] = POB_detect_model.predict_proba(df1).max(axis=1) * 100

In [None]:
from sklearn.metrics import accuracy_score,classification_report

# Accuracy on the rows the model was fitted on and on the held-out rows.
score = accuracy_score(y_train, y_pred_train)
score1 = accuracy_score(y_test, y_pred)

for caption, accuracy in (("Train data: ", score), ("Test data: ", score1)):
    print(caption, accuracy)

In [None]:
# Distinct labels present in the data, as strings.
# NOTE(review): `category_list` is not used by any cell visible in this notebook.
distinct_labels = df['Part of Body'].unique()
category_list = distinct_labels.astype(str)

# Per-class precision / recall / F1 on the held-out rows.
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
import pickle

# Persist the fitted classifier to 'model.pkl'.
with open('model.pkl', mode='wb') as model_sink:
    pickle.dump(POB_detect_model, model_sink)

In [None]:
import pickle

# Persist the fitted TF-IDF vectorizer (not the model) to 'tfidf.pkl', so new
# text can be transformed with the same vocabulary the classifier was fitted on.
with open('tfidf.pkl', mode='wb') as vectorizer_sink:
    pickle.dump(tv, vectorizer_sink)

In [None]:
# Display the working DataFrame, which by this point includes the 'Probability'
# column added in the prediction cell.
df