# E. coli Protein Sequence

In [4]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [6]:
# Read the E. coli dataset from the working directory and display the frame.
df = pd.read_csv(filepath_or_buffer="ecoli.csv")
df

Unnamed: 0,SEQUENCE_NAME,MCG,GVH,LIP,CHG,AAC,ALM1,ALM2,SITE
0,AAT_ECOLI,0.49,0.29,0.48,0.5,0.56,0.24,0.35,cp
1,ACEA_ECOLI,0.07,0.40,0.48,0.5,0.54,0.35,0.44,cp
2,ACEK_ECOLI,0.56,0.40,0.48,0.5,0.49,0.37,0.46,cp
3,ACKA_ECOLI,0.59,0.49,0.48,0.5,0.52,0.45,0.36,cp
4,ADI_ECOLI,0.23,0.32,0.48,0.5,0.55,0.25,0.35,cp
...,...,...,...,...,...,...,...,...,...
331,TREA_ECOLI,0.74,0.56,0.48,0.5,0.47,0.68,0.30,pp
332,UGPB_ECOLI,0.71,0.57,0.48,0.5,0.48,0.35,0.32,pp
333,USHA_ECOLI,0.61,0.60,0.48,0.5,0.44,0.39,0.38,pp
334,XYLF_ECOLI,0.59,0.61,0.48,0.5,0.42,0.42,0.37,pp


In [7]:
# Summarize each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336 entries, 0 to 335
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SEQUENCE_NAME  336 non-null    object 
 1   MCG            336 non-null    float64
 2   GVH            336 non-null    float64
 3   LIP            336 non-null    float64
 4   CHG            336 non-null    float64
 5   AAC            336 non-null    float64
 6   ALM1           336 non-null    float64
 7   ALM2           336 non-null    float64
 8   SITE           336 non-null    object 
dtypes: float64(7), object(2)
memory usage: 23.8+ KB


In [8]:
# Total number of missing cells over the whole frame (isna is the same check as
# isnull); summing the boolean mask counts every True.
print("Number of null values:", df.isna().to_numpy().sum())

Number of null values: 0


In [9]:
# Number of distinct values per column; dropna=False counts a missing value as
# its own category, matching len(Series.unique()).
{col: df[col].nunique(dropna=False) for col in df.columns}

{'SEQUENCE_NAME': 336,
 'MCG': 78,
 'GVH': 63,
 'LIP': 2,
 'CHG': 2,
 'AAC': 59,
 'ALM1': 82,
 'ALM2': 77,
 'SITE': 8}

In [10]:
# Drop SEQUENCE_NAME: it has a distinct value for every one of the 336 rows, so
# it acts as a record identifier rather than a feature.
# errors="ignore" keeps this cell safe to re-run: because `df` is reassigned
# here, a second execution would otherwise raise KeyError on the missing column.
df = df.drop(columns=["SEQUENCE_NAME"], errors="ignore")
df


Unnamed: 0,MCG,GVH,LIP,CHG,AAC,ALM1,ALM2,SITE
0,0.49,0.29,0.48,0.5,0.56,0.24,0.35,cp
1,0.07,0.40,0.48,0.5,0.54,0.35,0.44,cp
2,0.56,0.40,0.48,0.5,0.49,0.37,0.46,cp
3,0.59,0.49,0.48,0.5,0.52,0.45,0.36,cp
4,0.23,0.32,0.48,0.5,0.55,0.25,0.35,cp
...,...,...,...,...,...,...,...,...
331,0.74,0.56,0.48,0.5,0.47,0.68,0.30,pp
332,0.71,0.57,0.48,0.5,0.48,0.35,0.32,pp
333,0.61,0.60,0.48,0.5,0.44,0.39,0.38,pp
334,0.59,0.61,0.48,0.5,0.42,0.42,0.37,pp


In [11]:
# Collapse the target to two classes: keep "cp" where the label is already "cp"
# and replace every other label with "others".
df["SITE"] = df["SITE"].where(df["SITE"] == "cp", "others")
df

Unnamed: 0,MCG,GVH,LIP,CHG,AAC,ALM1,ALM2,SITE
0,0.49,0.29,0.48,0.5,0.56,0.24,0.35,cp
1,0.07,0.40,0.48,0.5,0.54,0.35,0.44,cp
2,0.56,0.40,0.48,0.5,0.49,0.37,0.46,cp
3,0.59,0.49,0.48,0.5,0.52,0.45,0.36,cp
4,0.23,0.32,0.48,0.5,0.55,0.25,0.35,cp
...,...,...,...,...,...,...,...,...
331,0.74,0.56,0.48,0.5,0.47,0.68,0.30,others
332,0.71,0.57,0.48,0.5,0.48,0.35,0.32,others
333,0.61,0.60,0.48,0.5,0.44,0.39,0.38,others
334,0.59,0.61,0.48,0.5,0.42,0.42,0.37,others


In [12]:
# Separate the predictor columns (X) from the target column (y), then show X.
X = df.drop(columns=["SITE"])
y = df["SITE"]
X

Unnamed: 0,MCG,GVH,LIP,CHG,AAC,ALM1,ALM2
0,0.49,0.29,0.48,0.5,0.56,0.24,0.35
1,0.07,0.40,0.48,0.5,0.54,0.35,0.44
2,0.56,0.40,0.48,0.5,0.49,0.37,0.46
3,0.59,0.49,0.48,0.5,0.52,0.45,0.36
4,0.23,0.32,0.48,0.5,0.55,0.25,0.35
...,...,...,...,...,...,...,...
331,0.74,0.56,0.48,0.5,0.47,0.68,0.30
332,0.71,0.57,0.48,0.5,0.48,0.35,0.32
333,0.61,0.60,0.48,0.5,0.44,0.39,0.38
334,0.59,0.61,0.48,0.5,0.42,0.42,0.37


In [13]:
# Shuffled 70/30 train/test partition; the fixed random_state makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    shuffle=True,
    random_state=123,
)

In [14]:
# Preview the feature rows assigned to the training partition.
X_train

Unnamed: 0,MCG,GVH,LIP,CHG,AAC,ALM1,ALM2
216,0.30,0.51,0.48,0.5,0.42,0.61,0.34
298,0.64,0.72,0.48,0.5,0.49,0.42,0.19
220,0.85,0.53,0.48,0.5,0.53,0.52,0.35
246,0.76,0.41,0.48,0.5,0.50,0.59,0.62
24,0.36,0.39,0.48,0.5,0.48,0.22,0.23
...,...,...,...,...,...,...,...
83,0.34,0.55,0.48,0.5,0.58,0.31,0.41
17,0.44,0.27,0.48,0.5,0.55,0.52,0.58
230,0.75,0.37,0.48,0.5,0.64,0.70,0.74
98,0.65,0.55,0.48,0.5,0.34,0.37,0.28


In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) for each training feature.
X_train.describe()

Unnamed: 0,MCG,GVH,LIP,CHG,AAC,ALM1,ALM2
count,235.0,235.0,235.0,235.0,235.0,235.0,235.0
mean,0.506128,0.505362,0.495489,0.502128,0.504553,0.491745,0.488553
std,0.198651,0.156591,0.088589,0.032616,0.12804,0.20938,0.204349
min,0.0,0.16,0.48,0.5,0.0,0.03,0.01
25%,0.34,0.395,0.48,0.5,0.42,0.33,0.34
50%,0.51,0.48,0.48,0.5,0.5,0.45,0.43
75%,0.67,0.57,0.48,0.5,0.57,0.67,0.68
max,0.89,1.0,1.0,1.0,0.88,0.94,0.94


In [16]:
# Preview the target labels of the training partition.
y_train

216    others
298    others
220    others
246    others
24         cp
        ...  
83         cp
17         cp
230    others
98         cp
322    others
Name: SITE, Length: 235, dtype: object

In [17]:
# Fit a logistic-regression classifier with default settings on the training
# partition (fit returns the estimator itself), then display its accuracy on the
# test partition.
model = LogisticRegression().fit(X_train, y_train)
model.score(X_test, y_test)

0.9900990099009901

In [18]:
# Report the test-partition accuracy as a percentage.
print(100 * model.score(X_test, y_test))

99.00990099009901
