## Load the necessary libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import pandas as pd
import fasttext

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/dbpedia-classes/DBP_wiki_data.csv
/kaggle/input/dbpedia-classes/DBPEDIA_test.csv
/kaggle/input/dbpedia-classes/DBPEDIA_val.csv
/kaggle/input/dbpedia-classes/DBPEDIA_train.csv


## Read the data file

In [2]:
# Loading train data
train_file = '/kaggle/input/dbpedia-classes/DBPEDIA_train.csv'
df = pd.read_csv(train_file)
df.rename(columns = {'text':'Texts','l1':'class', 'l2':'profession','l3':'Type'}, inplace = True)

# Loading test data
test_file = '/kaggle/input/dbpedia-classes/DBPEDIA_test.csv'
df_test = pd.read_csv(test_file)
df_test.rename(columns = {'text':'Texts','l1':'class', 'l2':'profession','l3':'Type'}, inplace = True)

# Data we have
print("Train:{}  Test:{}".format(df.shape, df_test.shape))

Train:(240942, 4)  Test:(60794, 4)


In [3]:
df.head()

Unnamed: 0,Texts,class,profession,Type
0,"William Alexander Massey (October 7, 1856 – Ma...",Agent,Politician,Senator
1,Lions is the sixth studio album by American ro...,Work,MusicalWork,Album
2,"Pirqa (Aymara and Quechua for wall, hispaniciz...",Place,NaturalPlace,Mountain
3,Cancer Prevention Research is a biweekly peer-...,Work,PeriodicalLiterature,AcademicJournal
4,The Princeton University Chapel is located on ...,Place,Building,HistoricBuilding


## Preprocessing

In [4]:
df['class'].value_counts()

Agent             124798
Place              45877
Species            21472
Work               21013
Event              19106
SportsSeason        5883
UnitOfWork          1761
TopicalConcept       784
Device               248
Name: class, dtype: int64

In [5]:
df['profession'].unique()

array(['Politician', 'MusicalWork', 'NaturalPlace',
       'PeriodicalLiterature', 'Building', 'Animal', 'Organisation',
       'Person', 'Athlete', 'Settlement', 'LegalCase', 'MotorcycleRider',
       'Company', 'RouteOfTransportation', 'SocietalEvent',
       'WinterSportPlayer', 'ClericalAdministrativeRegion',
       'EducationalInstitution', 'BodyOfWater', 'Plant', 'Infrastructure',
       'FootballLeagueSeason', 'Actor', 'SportsManager', 'Cleric',
       'Boxer', 'Cartoon', 'Venue', 'Artist', 'Tournament', 'Coach',
       'ComicsCharacter', 'Olympics', 'SportsTeamSeason', 'Software',
       'Group', 'Broadcaster', 'Tower', 'Race', 'SportFacility',
       'SportsTeam', 'SportsEvent', 'Eukaryote', 'Scientist',
       'CelestialBody', 'Engine', 'BritishRoyalty', 'Satellite', 'Comic',
       'WrittenWork', 'FictionalCharacter', 'Presenter', 'Horse',
       'NaturalEvent', 'AmusementParkAttraction', 'Writer', 'Song',
       'RaceTrack', 'SportsLeague', 'Genre', 'GridironFootballPlayer'

In [6]:
df['Type'].value_counts()

AcademicJournal          1924
Manga                    1924
FigureSkater             1924
OlympicEvent             1923
SoccerTournament         1922
                         ... 
Cycad                     145
AnimangaCharacter         144
BeachVolleyballPlayer     137
CanadianFootballTeam      133
BiologicalDatabase        129
Name: Type, Length: 219, dtype: int64

In [7]:
professions = {}
i = 0
for Name in df['profession'].unique():
    professions[Name] = i
    i +=1
    
# Mapping the classes
df['Class'] = df['profession'].map(professions)
df_test['Class'] = df_test['profession'].map(professions)

In [8]:
df.drop('class', axis = 1, inplace = True)
df_test.drop('class', axis = 1, inplace = True)
df.head()

Unnamed: 0,Texts,profession,Type,Class
0,"William Alexander Massey (October 7, 1856 – Ma...",Politician,Senator,0
1,Lions is the sixth studio album by American ro...,MusicalWork,Album,1
2,"Pirqa (Aymara and Quechua for wall, hispaniciz...",NaturalPlace,Mountain,2
3,Cancer Prevention Research is a biweekly peer-...,PeriodicalLiterature,AcademicJournal,3
4,The Princeton University Chapel is located on ...,Building,HistoricBuilding,4


## Text Cleaning

In [9]:
# Lets do some cleaning of this text
def clean_it(text,normalize=True):
    # Replacing possible issues with data. We can add or reduce the replacemtent in this chain
    s = str(text).replace(',',' ').replace('"','').replace('\' ',' \' ').replace('.',' . ').replace('(',' ( ').\
            replace(')',' ) ').replace('!',' ! ').replace('?',' ? ').replace(':',' ').replace(';',' ').lower()
    
    # normalizing / encoding the text
    if normalize:
        s = s.normalize('NFKD').str.encode('ascii','ignore').str.decode('utf-8')
    
    return s

In [10]:
'__class__' + df['Class'].iloc[0].astype(str) + ' '

'__class__0 '

In [11]:
# Now lets define a small function where we can use above cleaning on datasets
def clean_df(data, cleanit= False, shuffleit=False, encodeit=False, label_prefix='__class__'):
    # Defining the new data
    df = data[['Type','Texts']].copy(deep=True)
    df['Class'] = label_prefix + data['Class'].astype(str) + ' '
    
    # cleaning it
    if cleanit:
        df['Type'] = df['Type'].apply(lambda x: clean_it(x,encodeit))
        df['Texts'] = df['Texts'].apply(lambda x: clean_it(x,encodeit))
    
    # shuffling it
    if shuffleit:
        df.sample(frac=1).reset_index(drop=True)
            
    return df

In [12]:
%%time
# Transform the datasets using the above clean functions
df_train_cleaned = clean_df(df, True, True)
df_test_cleaned = clean_df(df_test, True, True)

CPU times: user 4.1 s, sys: 345 ms, total: 4.45 s
Wall time: 4.45 s


In [13]:
df_train_cleaned.head()

Unnamed: 0,Type,Texts,Class
0,senator,william alexander massey ( october 7 1856 – ...,__class__0
1,album,lions is the sixth studio album by american ro...,__class__1
2,mountain,pirqa ( aymara and quechua for wall hispanic...,__class__2
3,academicjournal,cancer prevention research is a biweekly peer-...,__class__3
4,historicbuilding,the princeton university chapel is located on ...,__class__4


#### Write files to disk as fastText classifier API reads files from disk.

In [14]:
# Write files to disk as fastText classifier API reads files from disk.
train_file = 'dbpedia_train.csv'
df_train_cleaned.to_csv(train_file, header=None, index=False, columns=['Class','Type','Texts'])


test_file = 'dbpedia_test.csv'
df_test_cleaned.to_csv(test_file, header=None, index=False, columns=['Class','Type','Texts'])

## FastText
Train and test files written into disk in a format fastText wants, we are ready to use it for text classification!

In [15]:
%%time
## Using fastText for feature extraction and training
from fasttext import train_supervised 
"""
fastText expects and training file (csv), a model name as input arguments.
label_prefix refers to the prefix before label string in the dataset.
default is __label__. In our dataset, it is __class__. 
There are several other parameters which can be seen in: 
https://pypi.org/project/fasttext/
"""
model = train_supervised(input=train_file, label="__class__", lr=1.0, epoch=75, loss='ova', wordNgrams=2, dim=200, thread=2, verbose=100)

Read 27M words
Number of words:  753758
Number of labels: 70
Progress: 100.0% words/sec/thread:  871758 lr:  0.000085 avg.loss:  0.022439 ETA:   0h 0m 0s

CPU times: user 39min 23s, sys: 15.8 s, total: 39min 38s
Wall time: 19min 51s


Progress: 100.0% words/sec/thread:  871758 lr:  0.000000 avg.loss:  0.022437 ETA:   0h 0m 0s


In [16]:
# Save the model
model.save_model("FastText.bin")

# Load the model
FastText = fasttext.load_model("FastText.bin")



In [17]:
print('Number of words :',len(FastText.words))
print('Label :',len(FastText.labels))

Number of words : 753758
Label : 70


## Evaluation

In [18]:
for k in range(1,6):
    results = FastText.test(test_file,k=k)
    print(f"Test Samples: {results[0]} Precision@{k} : {results[1]*100:2.4f} Recall@{k} : {results[2]*100:2.4f}")

Test Samples: 60794 Precision@1 : 96.6888 Recall@1 : 96.6888
Test Samples: 60794 Precision@2 : 49.1998 Recall@2 : 98.3995
Test Samples: 60794 Precision@3 : 32.8552 Recall@3 : 98.5656
Test Samples: 60794 Precision@4 : 24.6484 Recall@4 : 98.5936
Test Samples: 60794 Precision@5 : 19.8158 Recall@5 : 99.0789


## Prediction

In [19]:
print('Text :',df_test_cleaned['Texts'].iloc[0])
print('Actual :',df_test_cleaned['Class'].iloc[0])
print('Prediction :',FastText.predict(df_test_cleaned['Texts'].iloc[0]))

Text : liu chao-shiuan  ( chinese  劉兆玄  pinyin  liú zhàoxuán  born may 10  1943 )  is a taiwanese educator and politician .  he is a former president of the national tsing hua university  ( 1987–1993 )  and soochow university  ( 2004–2008 )  and a former premier of the republic of china  ( 2008–2009 )  . 
Actual : __class__0 
Prediction : (('__class__0',), array([1.00001001]))
