In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import re

In [2]:
data = pd.read_csv('data/Language Detection.csv')
data.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [3]:
class Remove(BaseEstimator, TransformerMixin):
    def fit(self, X):
        return self
    
    def transform(self, X):
        text_list = []
        for _,row in X.iterrows():
            text = row['Text']
            text = re.sub(r'[!@#$(),n%^&*?:;~`0-9]]', ' ', text)
            text = text.replace("[", "")
            text = text.replace("]", "")
            text = text.lower()
            text_list.append(text)
        return text_list


class Vectorizer(BaseEstimator, TransformerMixin):
    def fit(self, X):
        return self
    
    def transform(self, X):
        cv = CountVectorizer()
        bow = cv.fit_transform(X).toarray()
        return bow

class ToArray(BaseEstimator, TransformerMixin):
    def fit(self, X):
        return self
    def transform(self, X):
        arr = np.array(X)
        return arr

In [4]:
train_data, test_data = train_test_split(data, test_size=0.2)

In [5]:
target_column = 'Language'
X_train = train_data.drop([target_column], axis=1)
y_train = train_data[target_column]
X_test = test_data.drop([target_column], axis=1)
y_test = test_data[target_column]

In [6]:
print(type(X_train))
print(y_train)
print(type(X_test))
print(y_test)

<class 'pandas.core.frame.DataFrame'>
9426      Arabic
2305       Tamil
10323    Kannada
2094       Tamil
918      English
          ...   
7541     Italian
7419     Italian
10041    Kannada
5241     Spanish
2344       Tamil
Name: Language, Length: 8269, dtype: object
<class 'pandas.core.frame.DataFrame'>
2459         Tamil
1074       English
8021       Turkish
1704     Malayalam
10218      Kannada
           ...    
549        English
4060        French
8958      Sweedish
9160        Arabic
3289        French
Name: Language, Length: 2068, dtype: object


In [7]:
from sklearn.feature_extraction.text import CountVectorizer

pipeline = Pipeline([
    ("remover", Remove()),
    ("vectorizer", Vectorizer())
])

In [8]:
from sklearn import set_config
set_config(display="diagram")
pipeline

In [9]:
X_train.head()

Unnamed: 0,Text
9426,يقدم عروض.
2305,"யாராவது அழுகிறார்கள், அதை விரும்பவில்லை என்றால்."
10323,ಇಲ್ಲಿ ಕೆಲವು ಚಿನ್ನದ ಆಲೂಗಡ್ಡೆಗಳನ್ನು ಪ್ರಯತ್ನಿಸಿ ಅ...
2094,—மீக்கா ரியோக்காசு[54] ஏனைய பாரம்பரியக் கலைக்க...
918,"Based on the concept of strong rules, Rakesh A..."


In [10]:
x_train_arr = pipeline.fit_transform(X_train)
x_test_arr = pipeline.transform(X_test)
print(x_train_arr.shape)
print(x_test_arr.shape)

(8269, 34745)
(2068, 14304)


In [11]:
x_train_arr

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [53]:
data_list = []
for _,row in X_train.iterrows():
    text = row['Text']
    text = re.sub(r'[!@#$(),\n"%^*?\:;~`0-9]', '', text)
    text = text.replace("[","")
    text = text.replace("]", "")
    text = text.lower()
    data_list.append(text)

In [54]:
print(data_list)

