### The goal of this notebook is to showcase a cheap, fast way to handle text classification tasks without any fancy hardware.

#### PS: I am using my MacBook Air.

#### Step 1: 
    
    Transform the text with a TF-IDF feature extractor, using an n-gram range of 1 to 2 and a character analyzer (analyzer='char').
    
#### Step 2: 
    
    Reduce the TF-IDF vectors with Truncated SVD, keeping the components that capture the most variance. Truncated SVD is the go-to reduction method for sparse matrices.
    
#### Step 3: 
    
    Run a logistic regression model on the reduced vectors.
    


In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
# Read the three pre-split CSV files (train / validation / test) in one pass.
train, valid, test = map(
    pd.read_csv,
    ("train_lang.csv", "valid_lang.csv", "test_lang.csv"),
)

In [3]:
# Character-level TF-IDF over 1- and 2-grams. The vectorizer is fitted on the
# training texts only; the validation and test texts are transformed with the
# already-fitted vectorizer.
count = TfidfVectorizer(analyzer='char', ngram_range=(1, 2))
train_count = count.fit_transform(train["text"].values)
valid_count, test_count = (
    count.transform(split["text"].values) for split in (valid, test)
)

In [4]:
# Shape of the TF-IDF training matrix:
# (number of training texts, number of character n-gram features).
train_count.shape

(84000, 4639)

In [5]:
# Project the sparse TF-IDF matrices onto 400 SVD components. The projection is
# fitted on the training split only and then applied unchanged to the
# validation and test splits.
# random_state fixes the solver's random initialisation so that a fresh
# "Restart & Run All" reproduces the same components.
svd = TruncatedSVD(n_components=400, algorithm="arpack", random_state=42)
train_svd = svd.fit_transform(train_count)
valid_svd = svd.transform(valid_count)
test_svd = svd.transform(test_count)

In [6]:
# Sum of the per-component explained-variance ratios of the fitted SVD,
# i.e. the share of variance kept by the retained components.
svd.explained_variance_ratio_.sum()

0.8112106132244827

In [7]:
# Train a default-configured logistic regression on the SVD features
# (fit returns the estimator itself), then report per-language metrics
# on the validation split.
lr = LogisticRegression().fit(train_svd, train["lang"])
valid_preds = lr.predict(valid_svd)
print(classification_report(y_true=valid["lang"], y_pred=valid_preds))



             precision    recall  f1-score   support

         bg       1.00      1.00      1.00      1000
         cs       1.00      0.98      0.99      1000
         da       0.99      0.99      0.99      1000
         de       1.00      1.00      1.00      1000
         el       1.00      1.00      1.00      1000
         en       1.00      1.00      1.00      1000
         es       0.99      0.98      0.99      1000
         et       0.99      0.99      0.99      1000
         fi       1.00      0.99      1.00      1000
         fr       0.99      0.99      0.99      1000
         hu       1.00      1.00      1.00      1000
         it       0.99      0.99      0.99      1000
         lt       0.99      1.00      1.00      1000
         lv       1.00      1.00      1.00      1000
         nl       0.99      1.00      0.99      1000
         pl       1.00      1.00      1.00      1000
         pt       0.99      0.99      0.99      1000
         ro       0.99      1.00      1.00   

In [8]:
# Per-language precision / recall / F1 on the test split, using the
# classifier fitted above.
print(
    classification_report(
        y_true=test["lang"],
        y_pred=lr.predict(test_svd),
    )
)



             precision    recall  f1-score   support

         bg       1.00      1.00      1.00      1000
         cs       0.99      0.99      0.99      1000
         da       0.99      0.99      0.99       999
         de       0.99      0.99      0.99       999
         el       1.00      1.00      1.00      1000
         en       0.98      1.00      0.99      1000
         es       0.99      0.98      0.99       999
         et       0.99      0.99      0.99       999
         fi       1.00      0.99      0.99       998
         fr       0.99      0.99      0.99       999
         hu       1.00      1.00      1.00      1000
         it       0.99      1.00      0.99       998
         lt       1.00      1.00      1.00      1000
         lv       1.00      1.00      1.00      1000
         nl       1.00      0.98      0.99      1000
         pl       1.00      1.00      1.00      1000
         pt       0.99      0.99      0.99      1000
         ro       1.00      1.00      1.00   