In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib notebook
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Read the Hepatitis C CSV (path is relative to the notebook's working directory)
# into the DataFrame that every later cell works on.
data = pd.read_csv(filepath_or_buffer='../input/hepatitis-c-dataset/HepatitisCdata.csv')

In [None]:
# Preview the first 100 rows of the freshly loaded frame.
data.iloc[:100]

# Cleaning data

In [None]:
# Remove the 'Unnamed: 0' column. Rebinding `data` (instead of inplace=True) together
# with errors='ignore' keeps this cell safe to re-run: a second execution is a no-op
# rather than a KeyError for the already-dropped column.
data = data.drop(columns='Unnamed: 0', errors='ignore')

The "Unnamed: 0" column has been removed because it is not needed for the analysis.

In [None]:
# Encode the diagnosis labels as integer codes, touching only the Category column
# (a frame-wide replace with digit strings would leave the column as object dtype).
category_codes = {
    '0=Blood Donor': 0,
    '0s=suspect Blood Donor': 1,
    '1=Hepatitis': 2,
    '2=Fibrosis': 3,
    '3=Cirrhosis': 4,
}
# Values that are not keys of the mapping (e.g. codes left by an earlier run of this
# cell) pass through unchanged, so re-running is harmless; astype(int) then fails
# loudly if a label without a code, or a missing value, is present.
data['Category'] = (
    data['Category']
    .map(lambda label: category_codes.get(label, label))
    .astype(int)
)

In [None]:
# List the distinct codes now present in the Category column.
data.Category.unique()

In [None]:
# Encode sex as an integer code (m -> 0, f -> 1), touching only the Sex column.
sex_codes = {'m': 0, 'f': 1}
# Values that are not keys of the mapping (e.g. codes left by an earlier run of this
# cell) pass through unchanged, so re-running is harmless; astype(int) then fails
# loudly if an unexpected or missing value is present.
data['Sex'] = (
    data['Sex']
    .map(lambda value: sex_codes.get(value, value))
    .astype(int)
)

          The following values have been replaced for convenience
          0=Blood Donor               Category column
          1=suspect Blood Donor
          2=Hepatitis
          3=Fibrosis
          4=Cirrhosis
          ------------------------------------------------
          0=m(Male)                   sex column
          1=f(Female)

In [None]:
# Inspect the first 600 rows after the label encoding.
data.iloc[:600]

# EDA

In [None]:
   # Switch matplotlib to the inline backend for the plots below; this replaces the
   # `%matplotlib notebook` backend selected in the import cell.
   %matplotlib inline

In [None]:
# One kernel-density panel per numeric column. The grid is sized from the number of
# columns actually plotted and drawn on an explicitly sized figure; the previous fixed
# 11x5 grid at the default figure size squeezed every panel to an unreadable size.
numeric_columns = data.select_dtypes(include='number')
panels_per_row = 4
panel_rows = -(-numeric_columns.shape[1] // panels_per_row)  # ceiling division
numeric_columns.plot(
    kind='density',
    subplots=True,
    layout=(panel_rows, panels_per_row),
    sharex=False,
    figsize=(16, 3 * panel_rows),
)
plt.tight_layout()
plt.show()

In [None]:
# Pairwise Pearson correlation table for the frame (Pearson is the default method).
data.corr(method='pearson')

In [None]:
from pandas.plotting import scatter_matrix

In [None]:
# Grid of pairwise scatter plots on a 20x20 inch figure.
scatter_matrix(frame=data, figsize=(20, 20))
plt.show()

In [None]:
# Keep the Pearson correlation matrix around for the heat map drawn in the next cell.
correlations = data.corr(method='pearson')

In [None]:
# Draw the correlation matrix as a colour-coded grid. The colour scale is pinned to
# [-1, 1] so it always covers the full range a correlation coefficient can take.
fig, ax = plt.subplots()
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(mappable=cax)
plt.show()

In [None]:
# Overlaid histograms of the numeric columns. DataFrame.plot opens its own figure when
# no `ax` is passed, so no plt.figure() call precedes it (that call only left an empty
# extra figure behind).
data.plot(kind='hist')
plt.show()


In [None]:
plt.figure()
# sns.distplot is deprecated in seaborn; histplot with kde=True and stat='density' is
# the replacement described in seaborn's migration notes. The frame is flattened into
# a single float sample with nulls removed before plotting.
# NOTE(review): pooling all columns presumably mixes different measurement scales;
# consider one distribution plot per column instead -- confirm intent.
pooled_values = np.asarray(data, float)
pooled_values = pooled_values[~np.isnan(pooled_values)]
sns.histplot(pooled_values, kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Share of each diagnosis category, drawn as a pie chart on a fresh figure.
category_counts = data['Category'].value_counts()
plt.figure()
category_counts.plot.pie()

In [None]:
# Number of records per sex code, drawn as a bar chart on a fresh figure.
sex_counts = data['Sex'].value_counts()
plt.figure()
sex_counts.plot.bar()
plt.xlabel("SEX [Male / Female]")
plt.ylabel("Count")
plt.show()


# Null value checking and replacement

In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:
# (rows, columns) of the frame -- the scale against which the null counts can be judged.
data.shape

The dataset contains only 31 null values, so we can replace them with the mode (the most frequent value) of each
affected column: "ALB", "ALP", "ALT", "CHOL" and "PROT".

In [None]:
# Fill the missing values of each affected column with that column's mode (its most
# frequent value). Assigning the filled Series back to the frame replaces the earlier
# chained `data[col].replace(..., inplace=True)` calls, which do not reliably write
# through to `data` under pandas Copy-on-Write, and avoids the `np.NaN` alias that
# NumPy 2.0 removed.
for column in ['ALP', 'PROT', 'CHOL', 'ALT', 'ALB']:
    # mode() returns a Series of the most frequent values; [0] takes the first of them.
    data[column] = data[column].fillna(data[column].mode()[0])

In [None]:
# Re-count missing values per column to confirm the nulls have been replaced.
data.isna().sum()

# Building a Prediction Model

In [None]:
# Features: every column except the target. Target: the Category column.
x = data.drop(columns=['Category'])
y = data.Category

In [None]:
x  # feature frame: every column of `data` except Category, used as the model input

In [None]:
y  # target Series: the Category column that the models are trained to predict

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class proportions
# the same in both parts, and the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

In [None]:
# Shapes of the full, training and test feature frames, in that order.
print(f"{x.shape} {x_train.shape} {x_test.shape}")

In [None]:
# Logistic-regression classifier with scikit-learn's default settings.
# NOTE(review): the features are not scaled anywhere above, which may make the default
# solver hit its iteration limit -- confirm no ConvergenceWarning is raised when fitting.
model= LogisticRegression()

In [None]:
# Fit the classifier on the training portion only.
model.fit(X=x_train, y=y_train)

In [None]:
# Score the fitted model on the rows it was trained on AND on the held-out rows.
# Training accuracy alone says nothing about generalisation, and the test split
# created earlier was otherwise never used.
# accuracy_score takes (y_true, y_pred), so the true labels are passed first.
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(y_train, x_train_prediction)

x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(y_test, x_test_prediction)

print('Accuracy on training data', training_data_accuracy)
print('Accuracy on test data', test_data_accuracy)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Random forest evaluated in a later cell. random_state is fixed (same seed as the
# train/test split) so its cross-validation score is reproducible between runs.
randomforest_classifier = RandomForestClassifier(n_estimators=10, random_state=2)

# Mean 10-fold cross-validated accuracy of k-nearest-neighbours for k = 1..20;
# knn_score[i] holds the result for k = i + 1.
knn_score = []
for k in range(1, 21):
    knn_classifier = KNeighborsClassifier(n_neighbors=k)
    score = cross_val_score(knn_classifier, x, y, cv=10)
    knn_score.append(score.mean())

In [None]:
# Mean accuracy of the random forest across 10 cross-validation folds.
score = cross_val_score(estimator=randomforest_classifier, X=x, y=y, cv=10)
score.mean()

In [None]:
# Mean accuracy of a 12-neighbour KNN classifier across 10 cross-validation folds.
knn_classifier = KNeighborsClassifier(n_neighbors=12)
score2 = cross_val_score(estimator=knn_classifier, X=x, y=y, cv=10)
score2.mean()