In [56]:
import pandas as pd
import numpy as np

In [57]:
import matplotlib.pyplot as plt
import seaborn as sns

In [58]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [59]:
# Read ecoli.csv (path is relative to the notebook's working directory).
csv_path = 'ecoli.csv'
df = pd.read_csv(csv_path)
# Bare last expression: the notebook renders the frame as a table.
df

Unnamed: 0,SEQUENCE_NAME,MCG,GVH,LIP,CHG,AAC,ALM1,ALM2,SITE
0,AAT_ECOLI,0.49,0.29,0.48,0.5,0.56,0.24,0.35,cp
1,ACEA_ECOLI,0.07,0.40,0.48,0.5,0.54,0.35,0.44,cp
2,ACEK_ECOLI,0.56,0.40,0.48,0.5,0.49,0.37,0.46,cp
3,ACKA_ECOLI,0.59,0.49,0.48,0.5,0.52,0.45,0.36,cp
4,ADI_ECOLI,0.23,0.32,0.48,0.5,0.55,0.25,0.35,cp
...,...,...,...,...,...,...,...,...,...
331,TREA_ECOLI,0.74,0.56,0.48,0.5,0.47,0.68,0.30,pp
332,UGPB_ECOLI,0.71,0.57,0.48,0.5,0.48,0.35,0.32,pp
333,USHA_ECOLI,0.61,0.60,0.48,0.5,0.44,0.39,0.38,pp
334,XYLF_ECOLI,0.59,0.61,0.48,0.5,0.42,0.42,0.37,pp


In [60]:
# Print a structural summary of the frame: row count, per-column non-null
# counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336 entries, 0 to 335
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SEQUENCE_NAME  336 non-null    object 
 1   MCG            336 non-null    float64
 2   GVH            336 non-null    float64
 3   LIP            336 non-null    float64
 4   CHG            336 non-null    float64
 5   AAC            336 non-null    float64
 6   ALM1           336 non-null    float64
 7   ALM2           336 non-null    float64
 8   SITE           336 non-null    object 
dtypes: float64(7), object(2)
memory usage: 23.8+ KB


In [61]:
# Count missing cells: per-column null counts first, then their grand total.
null_total = df.isnull().sum().sum()
print("Number of null values :", null_total)

Number of null values : 0


In [62]:
# Summary statistics for every column; include='all' adds the non-numeric
# columns (count/unique/top/freq) alongside the numeric ones.
df.describe(include='all')

Unnamed: 0,SEQUENCE_NAME,MCG,GVH,LIP,CHG,AAC,ALM1,ALM2,SITE
count,336,336.0,336.0,336.0,336.0,336.0,336.0,336.0,336
unique,336,,,,,,,,8
top,AAT_ECOLI,,,,,,,,cp
freq,1,,,,,,,,143
mean,,0.50006,0.5,0.495476,0.501488,0.50003,0.500179,0.499732,
std,,0.194634,0.148157,0.088495,0.027277,0.122376,0.215751,0.209411,
min,,0.0,0.16,0.48,0.5,0.0,0.03,0.0,
25%,,0.34,0.4,0.48,0.5,0.42,0.33,0.35,
50%,,0.5,0.47,0.48,0.5,0.495,0.455,0.43,
75%,,0.6625,0.57,0.48,0.5,0.57,0.71,0.71,


In [63]:
# Tally the distinct values found in each column, keyed by column name.
unique_counts = {}
for column_name in df.columns:
    unique_counts[column_name] = len(df[column_name].unique())
unique_counts

{'SEQUENCE_NAME': 336,
 'MCG': 78,
 'GVH': 63,
 'LIP': 2,
 'CHG': 2,
 'AAC': 59,
 'ALM1': 82,
 'ALM2': 77,
 'SITE': 8}

In [64]:
# Per-column count of distinct values (nunique skips NaN by default).
distinct_per_column = df.nunique()
print(distinct_per_column)

SEQUENCE_NAME    336
MCG               78
GVH               63
LIP                2
CHG                2
AAC               59
ALM1              82
ALM2              77
SITE               8
dtype: int64


In [65]:
# SEQUENCE_NAME holds a different value on every row (336 distinct values in
# 336 rows), i.e. it behaves as a row identifier; remove it from the frame.
# errors="ignore" keeps this cell re-runnable: `df` is reassigned here, so a
# second execution would otherwise raise KeyError for the already-missing column.
df = df.drop(columns=["SEQUENCE_NAME"], errors="ignore")
df

Unnamed: 0,MCG,GVH,LIP,CHG,AAC,ALM1,ALM2,SITE
0,0.49,0.29,0.48,0.5,0.56,0.24,0.35,cp
1,0.07,0.40,0.48,0.5,0.54,0.35,0.44,cp
2,0.56,0.40,0.48,0.5,0.49,0.37,0.46,cp
3,0.59,0.49,0.48,0.5,0.52,0.45,0.36,cp
4,0.23,0.32,0.48,0.5,0.55,0.25,0.35,cp
...,...,...,...,...,...,...,...,...
331,0.74,0.56,0.48,0.5,0.47,0.68,0.30,pp
332,0.71,0.57,0.48,0.5,0.48,0.35,0.32,pp
333,0.61,0.60,0.48,0.5,0.44,0.39,0.38,pp
334,0.59,0.61,0.48,0.5,0.42,0.42,0.37,pp


In [66]:
# Collapse the target to two classes: rows labelled 'cp' keep their label,
# every other site value is replaced by "others".
is_cp = df["SITE"] == 'cp'
df["SITE"] = df["SITE"].where(is_cp, "others")
df

Unnamed: 0,MCG,GVH,LIP,CHG,AAC,ALM1,ALM2,SITE
0,0.49,0.29,0.48,0.5,0.56,0.24,0.35,cp
1,0.07,0.40,0.48,0.5,0.54,0.35,0.44,cp
2,0.56,0.40,0.48,0.5,0.49,0.37,0.46,cp
3,0.59,0.49,0.48,0.5,0.52,0.45,0.36,cp
4,0.23,0.32,0.48,0.5,0.55,0.25,0.35,cp
...,...,...,...,...,...,...,...,...
331,0.74,0.56,0.48,0.5,0.47,0.68,0.30,others
332,0.71,0.57,0.48,0.5,0.48,0.35,0.32,others
333,0.61,0.60,0.48,0.5,0.44,0.39,0.38,others
334,0.59,0.61,0.48,0.5,0.42,0.42,0.37,others


In [67]:
# Target vector: the SITE label column.
y = df["SITE"]
# Feature matrix: every column except the label.
X = df.drop(columns=["SITE"])
X
    

Unnamed: 0,MCG,GVH,LIP,CHG,AAC,ALM1,ALM2
0,0.49,0.29,0.48,0.5,0.56,0.24,0.35
1,0.07,0.40,0.48,0.5,0.54,0.35,0.44
2,0.56,0.40,0.48,0.5,0.49,0.37,0.46
3,0.59,0.49,0.48,0.5,0.52,0.45,0.36
4,0.23,0.32,0.48,0.5,0.55,0.25,0.35
...,...,...,...,...,...,...,...
331,0.74,0.56,0.48,0.5,0.47,0.68,0.30
332,0.71,0.57,0.48,0.5,0.48,0.35,0.32
333,0.61,0.60,0.48,0.5,0.44,0.39,0.38
334,0.59,0.61,0.48,0.5,0.42,0.42,0.37


In [68]:
# 70% of the rows go to the training split; the fixed random_state makes the
# shuffled split repeatable from run to run.
split_parts = train_test_split(
    X,
    y,
    train_size=0.7,
    shuffle=True,
    random_state=2,
)
X_train, X_test, y_train, y_test = split_parts
X_train

Unnamed: 0,MCG,GVH,LIP,CHG,AAC,ALM1,ALM2
288,0.65,0.57,0.48,0.5,0.47,0.47,0.51
54,0.47,0.47,0.48,0.5,0.22,0.16,0.26
4,0.23,0.32,0.48,0.5,0.55,0.25,0.35
141,0.27,0.42,0.48,0.5,0.37,0.38,0.43
104,0.24,0.34,0.48,0.5,0.38,0.30,0.40
...,...,...,...,...,...,...,...
299,0.68,0.82,0.48,0.5,0.38,0.65,0.56
22,0.51,0.54,0.48,0.5,0.41,0.34,0.43
72,0.37,0.50,0.48,0.5,0.42,0.36,0.45
15,0.25,0.40,0.48,0.5,0.46,0.44,0.52


In [69]:
# Inspect the target labels of the test split.
y_test

55         cp
67         cp
161    others
217    others
66         cp
        ...  
228    others
142        cp
309    others
264    others
282    others
Name: SITE, Length: 101, dtype: object

In [77]:
# Fit a logistic-regression classifier with the library's default settings
# on the training split.
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate on the test split exactly once and reuse the value.
# score() returns mean accuracy as a fraction, so scale it to a percentage.
accuracy = model.score(X_test, y_test)
print(accuracy * 100)

94.05940594059405
