In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## 1 Exploratory Data Analysis

In [2]:
df = pd.read_csv('data\\train.txt')
df.head()

Unnamed: 0,>1|0
0,KWKLFKKIEKVGQNIRDGIIKAGPAVAVVGQATQIAK
1,>2|0
2,FLPAIVGAAAKFLPKIFCAISKKC
3,>3|0
4,FLKWLFKWAKK


In [3]:
df.shape

(999, 1)

In [4]:
df.describe()

Unnamed: 0,>1|0
count,999
unique,999
top,KWKLFKKIEKVGQNIRDGIIKAGPAVAVVGQATQIAK
freq,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   >1|0    999 non-null    object
dtypes: object(1)
memory usage: 7.9+ KB


In [6]:
df.query

<bound method DataFrame.query of                                       >1|0
0    KWKLFKKIEKVGQNIRDGIIKAGPAVAVVGQATQIAK
1                                     >2|0
2                 FLPAIVGAAAKFLPKIFCAISKKC
3                                     >3|0
4                              FLKWLFKWAKK
..                                     ...
994        CGESCVFIPCISTLLGCSCKNKVCYRNGVIP
995                                 >249|1
996                          FLPIVTNLLSGLL
997                                 >250|1
998                      GALRGCWTKSYPPKPCK

[999 rows x 1 columns]>

## 2 Missing values

In [7]:
# display all the columns with missing values
df.isna().sum()

>1|0    0
dtype: int64

## 3 Data Splitting

In [8]:
# split the data in x and y
x = df['>1|0']
y = df['>1|0'].str.split('|',expand=True)
y

Unnamed: 0,0,1
0,KWKLFKKIEKVGQNIRDGIIKAGPAVAVVGQATQIAK,
1,>2,0
2,FLPAIVGAAAKFLPKIFCAISKKC,
3,>3,0
4,FLKWLFKWAKK,
...,...,...
994,CGESCVFIPCISTLLGCSCKNKVCYRNGVIP,
995,>249,1
996,FLPIVTNLLSGLL,
997,>250,1


## 4 Preprocessing

In [9]:
# use regex to remove the > from the data
x = x.str.replace('>','')
x

0      KWKLFKKIEKVGQNIRDGIIKAGPAVAVVGQATQIAK
1                                        2|0
2                   FLPAIVGAAAKFLPKIFCAISKKC
3                                        3|0
4                                FLKWLFKWAKK
                       ...                  
994          CGESCVFIPCISTLLGCSCKNKVCYRNGVIP
995                                    249|1
996                            FLPIVTNLLSGLL
997                                    250|1
998                        GALRGCWTKSYPPKPCK
Name: >1|0, Length: 999, dtype: object

In [10]:
# use regex to remove the | from the data
y = y.apply(lambda x: x.str.replace('|',''))
y

Unnamed: 0,0,1
0,KWKLFKKIEKVGQNIRDGIIKAGPAVAVVGQATQIAK,
1,>2,0
2,FLPAIVGAAAKFLPKIFCAISKKC,
3,>3,0
4,FLKWLFKWAKK,
...,...,...
994,CGESCVFIPCISTLLGCSCKNKVCYRNGVIP,
995,>249,1
996,FLPIVTNLLSGLL,
997,>250,1


In [11]:
# use regex to remove the > from the data
x = x.str.replace('>','')
x

0      KWKLFKKIEKVGQNIRDGIIKAGPAVAVVGQATQIAK
1                                        2|0
2                   FLPAIVGAAAKFLPKIFCAISKKC
3                                        3|0
4                                FLKWLFKWAKK
                       ...                  
994          CGESCVFIPCISTLLGCSCKNKVCYRNGVIP
995                                    249|1
996                            FLPIVTNLLSGLL
997                                    250|1
998                        GALRGCWTKSYPPKPCK
Name: >1|0, Length: 999, dtype: object

In [12]:
# display the missing va
df.isnull().sum()

>1|0    0
dtype: int64

In [13]:
df.shape

(999, 1)

## 5 Creating a model

In [14]:
# train the model using sequence
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

x = vectorizer.fit_transform(x)

In [15]:
#from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Example data
#documents = ['This is a sample document.', 'Another example document.', 'A third document here.']
#labels = [0, 1, 0]  # Example labels

In [16]:
# Create a pipeline that transforms text to TF-IDF features and trains a MultinomialNB model
#text_clf = Pipeline([
#    ('tfidf', TfidfVectorizer()),
#    ('clf', MultinomialNB()),
#])

In [17]:
# Train the model
#text_clf.fit(documents, labels)

In [18]:
# Now you can predict on new data
#new_documents = ['This is a new document.']
#predicted_labels = text_clf.predict(new_documents)
#print(predicted_labels)

## Testing data

In [19]:
test = pd.read_csv("data\\test.text")
test

Unnamed: 0,ILGPVISTIGGVLGGLLKNL
0,DLRFLYPRGKLPVPTLPPFNPKPIYIDMGNRY
1,PDEDAINNALNKVCSTGRRQRSICKQLLKK
2,FAKIIAKIAKIAKKIL
3,EPHPDEFVGLM
4,GIPCGESCVFIPCLTSAIDCSCKSKVCYRN
...,...
158,DSHAKRHHGYKRKFHEKHHSHRGYRSNYLYDN
159,HGVSGHGQHGVHG
160,AWKKWAKAWKWAKAKWWAKAA
161,FLRFIGSVIHGIGHLVHHIGVAL
