# Clean Data
---

## 1. Import Modules

In [70]:
import numpy as np
import pandas as pd
from geotext import GeoText
from iso639 import find

## 2. Import Data

In [19]:
# Read the raw table from data.csv (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='data.csv')

## 3. EDA

In [85]:
# Summarise df: index range, per-column non-null counts, dtypes and memory use.
# NOTE(review): the saved output has execution count 85 and shows index 1..164,
# i.e. it was produced after the dropna cell in section 4a had already run.
# On a fresh top-to-bottom run this cell sees the raw frame instead.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 164 entries, 1 to 164
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   empire/dynasty   164 non-null    object
 1   established      164 non-null    object
 2   disestablished   164 non-null    object
 3   capital(s)       164 non-null    object
 4   language(s)      164 non-null    object
 5   present part of  164 non-null    object
dtypes: object(6)
memory usage: 9.0+ KB


In [86]:
# Count the missing values in each column.
df.isna().sum()

empire/dynasty     0
established        0
disestablished     0
capital(s)         0
language(s)        0
present part of    0
dtype: int64

In [20]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,empire/dynasty,established,disestablished,capital(s),language(s),present part of
0,,,,,,
1,"Brihadratha dynasty,\n",1700 BCE\n,682 BCE\n,"Girivraja,\n","Vedic Sanskrit,\n",",India,\n"
2,"Gopala Dynasty,\n",1400 BCE\n,841 BCE\n,"Pashupatinath Temple,, ,Gaushala,, ,Kathmandu,\n","Sanskrit,\n",",Nepal,\n"
3,"Pundravardhana Kingdom,\n",1280 BCE\n,345 BCE\n,"Mahasthangarh,\n","Pali,, ,Sanskrit,\n",",Bangladesh, ,India,\n"
4,"Kuru Kingdom,\n",1200 BCE\n,345 BCE\n,"Āsandīvat,, ,Hastinapura,, ,Indraprastha,\n","Sanskrit,\n",",India,\n"


## 4. Cleaning

---

### 4a. Drop Null Values

In [21]:
# Drop every row that contains at least one missing value (the preview in the
# EDA section shows row 0 is entirely empty). The result is rebound to df
# rather than using inplace=True, so the frame is not mutated behind outputs
# that earlier cells already displayed.
df = df.dropna(axis=0)

### 4b. Clean string values

Strip leading and trailing whitespace and remove every "\n" from each string value.

In [22]:
def _strip_newlines(value):
    """Trim surrounding whitespace and delete embedded newlines from a string.

    Non-string values are returned unchanged.
    """
    if isinstance(value, str):
        return value.strip().replace("\n", "")
    return value


# Apply the cleaner element-wise, one column at a time. Series.map is used
# instead of DataFrame.applymap because applymap is deprecated since
# pandas 2.1 (in favour of DataFrame.map); this form works on both old and
# new pandas releases.
df = df.apply(lambda column: column.map(_strip_newlines))

### 4c. Clean the country feature

In [34]:
def _countries_as_text(raw_value):
    """Return str() of the list of countries GeoText finds in raw_value."""
    recognised = GeoText(raw_value)
    return str(recognised.countries)


# Overwrite the column with the stringified country list for each row.
df['present part of'] = df['present part of'].apply(_countries_as_text)

In [49]:
df['present part of']

1                                      ['India']
2                                      ['Nepal']
3                        ['Bangladesh', 'India']
4                                      ['India']
5                                      ['India']
                         ...                    
160                        ['India', 'Pakistan']
161                                    ['India']
162                                    ['India']
163    ['Nepal', 'India', 'China', 'Bangladesh']
164               ['India', 'Pakistan', 'China']
Name: present part of, Length: 164, dtype: object

### 4d. Clean the languages feature

In [None]:
def _lookup_languages(raw_value):
    """Look up every comma-separated language name in raw_value with find().

    Tokens are stripped of surrounding whitespace, and empty tokens are
    skipped: values such as "Pali,, ,Sanskrit," otherwise yield "" and " "
    entries that are not language names. Non-string input yields an empty list.
    """
    if not isinstance(raw_value, str):
        return []
    names = (token.strip() for token in raw_value.split(','))
    return [find(name) for name in names if name]


# Display the per-row lookup results (the column itself is left untouched).
# NOTE(review): find() presumably returns nothing useful for names it does
# not recognise (e.g. "Vedic Sanskrit") - confirm before assigning back.
df['language(s)'].apply(_lookup_languages)