In [2]:
#necessary imports
import os
import pandas as pd

In [3]:
# Loading train data
train_file = "input/train.csv"
df = pd.read_csv(train_file, header=None, names=['class','name','description'])
# Loading test data
test_file = "input/test.csv"
df_test = pd.read_csv(test_file, header=None, names=['class','name','description'])
# Data we have
print("Train:{} Test:{}".format(df.shape,df_test.shape))

Train:(560000, 3) Test:(70000, 3)


In [4]:
df.head()

Unnamed: 0,class,name,description
0,1,E. D. Abbott Ltd,Abbott of Farnham E D Abbott Limited was a Br...
1,1,Schwan-Stabilo,Schwan-STABILO is a German maker of pens for ...
2,1,Q-workshop,Q-workshop is a Polish company located in Poz...
3,1,Marvell Software Solutions Israel,Marvell Software Solutions Israel known as RA...
4,1,Bergan Mercy Medical Center,Bergan Mercy Medical Center is a hospital loc...


In [5]:
df_test.head()

Unnamed: 0,class,name,description
0,1,TY KU,TY KU /taɪkuː/ is an American alcoholic bever...
1,1,Odd Lot Entertainment,OddLot Entertainment founded in 2001 by longt...
2,1,Henkel,Henkel AG & Company KGaA operates worldwide w...
3,1,GOAT Store,The GOAT Store (Games Of All Type Store) LLC ...
4,1,RagWing Aircraft Designs,RagWing Aircraft Designs (also called the Rag...


In [6]:
# Since we have no clue about the classes lets build one
# Mapping from class number to class name
class_dict={
            1:'Company',
            2:'EducationalInstitution',
            3:'Artist',
            4:'Athlete',
            5:'OfficeHolder',
            6:'MeanOfTransportation',
            7:'Building',
            8:'NaturalPlace',
            9:'Village',
            10:'Animal',
            11:'Plant',
            12:'Album',
            13:'Film',
            14:'WrittenWork'
        }

# Mapping the classes
df['class_name'] = df['class'].map(class_dict)
df.head()

Unnamed: 0,class,name,description,class_name
0,1,E. D. Abbott Ltd,Abbott of Farnham E D Abbott Limited was a Br...,Company
1,1,Schwan-Stabilo,Schwan-STABILO is a German maker of pens for ...,Company
2,1,Q-workshop,Q-workshop is a Polish company located in Poz...,Company
3,1,Marvell Software Solutions Israel,Marvell Software Solutions Israel known as RA...,Company
4,1,Bergan Mercy Medical Center,Bergan Mercy Medical Center is a hospital loc...,Company


In [7]:
df["class_name"].value_counts()

class_name
Company                   40000
EducationalInstitution    40000
Artist                    40000
Athlete                   40000
OfficeHolder              40000
MeanOfTransportation      40000
Building                  40000
NaturalPlace              40000
Village                   40000
Animal                    40000
Plant                     40000
Album                     40000
Film                      40000
WrittenWork               40000
Name: count, dtype: int64

In [9]:
# Lets do some cleaning of this text
def clean_it(text,normalize=True):
    # Replacing possible issues with data. We can add or reduce the replacemtent in this chain
    s = str(text).replace(',',' ').replace('"','').replace('\'',' \' ').replace('.',' . ').replace('(',' ( ').\
            replace(')',' ) ').replace('!',' ! ').replace('?',' ? ').replace(':',' ').replace(';',' ').lower()
    
    # normalizing / encoding the text
    if normalize:
        s = s.normalize('NFKD').str.encode('ascii','ignore').str.decode('utf-8')
    
    return s

# Now lets define a small function where we can use above cleaning on datasets
def clean_df(data, cleanit= False, shuffleit=False, encodeit=False, label_prefix='__class__'):
    # Defining the new data
    df = data[['name','description']].copy(deep=True)
    df['class'] = label_prefix + data['class'].astype(str) + ' '
    
    # cleaning it
    if cleanit:
        df['name'] = df['name'].apply(lambda x: clean_it(x,encodeit))
        df['description'] = df['description'].apply(lambda x: clean_it(x,encodeit))
    
    # shuffling it
    if shuffleit:
        df.sample(frac=1).reset_index(drop=True)
            
    return df

In [10]:
%%time
# Transform the datasets using the above clean functions
df_train_cleaned = clean_df(df, True, True)
df_test_cleaned = clean_df(df_test, True, True)

CPU times: total: 5.22 s
Wall time: 5.86 s


In [11]:
# Write files to disk as fastText classifier API reads files from disk.
train_file =  'final_train.csv'
df_train_cleaned.to_csv(train_file, header=None, index=False, columns=['class','name','description'] )

test_file = 'final_test.csv'
df_test_cleaned.to_csv(test_file, header=None, index=False, columns=['class','name','description'] )


Now that we have the train and test files written into disk in a format fastText wants, we are ready to use it for text classification!

In [13]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
Collecting pybind11>=2.2
  Using cached pybind11-2.12.0-py3-none-any.whl (234 kB)
Using legacy 'setup.py install' for fasttext, since package 'wheel' is not installed.
Installing collected packages: pybind11, fasttext
    Running setup.py install for fasttext: started
    Running setup.py install for fasttext: finished with status 'done'
Successfully installed fasttext-0.9.2 pybind11-2.12.0


You should consider upgrading via the 'C:\Users\Shraddha\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [12]:
from fasttext import train_supervised 
"""fastText expects and training file (csv), a model name as input arguments.
label_prefix refers to the prefix before label string in the dataset.
default is __label__. In our dataset, it is __class__. 
There are several other parameters which can be seen in: 
https://pypi.org/project/fasttext/
"""
model = train_supervised(input=train_file, label="__class__", lr=1.0, epoch=5, loss='ova', wordNgrams=2, dim=200, verbose=100)

In [13]:
for k in range(1,6):
    results = model.test(test_file,k=k)
    print(f"Test Samples: {results[0]} Precision@{k} : {results[1]*100:2.4f} Recall@{k} : {results[2]*100:2.4f}")

Test Samples: 70000 Precision@1 : 35.9200 Recall@1 : 35.9200
Test Samples: 70000 Precision@2 : 29.7114 Recall@2 : 59.4229
Test Samples: 70000 Precision@3 : 24.0090 Recall@3 : 72.0271
Test Samples: 70000 Precision@4 : 19.0621 Recall@4 : 76.2486
Test Samples: 70000 Precision@5 : 16.5589 Recall@5 : 82.7943


In [14]:
text = " l ' indépendant is a newspaper published in luxembourg from 1945 ."
prediction = model.predict(text)

In [15]:
print(prediction)

(('__class__14',), array([1.00001001]))


Try training a classifier on this dataset with, say, LogisticRegression to realize how fast fastText is! 93% Precision and Recall are hard numbers to beat, too!