In [1]:
import pandas as pd 
import numpy as np 
import glob
import matplotlib.pyplot as plt
import seaborn as sns

Clean all the files and combine them into a single csv file as domestic_visitors and foreign_visitors

In [2]:
domestic_visitors_2016 = pd.read_csv('domestic_visitors/domestic_visitors_2016.csv')
domestic_visitors_2016.head(30)

Unnamed: 0,district,date,month,year,visitors
0,Adilabad,01-01-2016,January,2016,792136.0
1,Adilabad,01-02-2016,February,2016,937820.0
2,Adilabad,01-03-2016,March,2016,582946.0
3,Adilabad,01-04-2016,April,2016,341948.0
4,Adilabad,01-05-2016,May,2016,252887.0
5,Adilabad,01-06-2016,June,2016,368237.0
6,Adilabad,01-07-2016,July,2016,447562.0
7,Adilabad,01-08-2016,August,2016,614285.0
8,Adilabad,01-09-2016,September,2016,491279.0
9,Adilabad,01-10-2016,October,2016,94184.0


In [3]:
domestic_visitors_2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 372 entries, 0 to 371
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   district  372 non-null    object
 1   date      372 non-null    object
 2   month     372 non-null    object
 3   year      372 non-null    int64 
 4   visitors  372 non-null    object
dtypes: int64(1), object(4)
memory usage: 14.7+ KB


In [4]:
domestic_visitors_2016.describe()

Unnamed: 0,year
count,372.0
mean,2016.0
std,0.0
min,2016.0
25%,2016.0
50%,2016.0
75%,2016.0
max,2016.0


In [5]:
domestic_visitors_2016[domestic_visitors_2016['visitors'] == ' '].count()

district    207
date        207
month       207
year        207
visitors    207
dtype: int64

It will show isnull and isna zero but there are empty cell values have value as ' ' 

In [6]:
domestic_visitors_2016.isnull().sum()

district    0
date        0
month       0
year        0
visitors    0
dtype: int64

In [7]:
domestic_visitors_2017 = pd.read_csv('domestic_visitors/domestic_visitors_2017.csv')
domestic_visitors_2017.head()
domestic_visitors_2017.isnull().sum()
domestic_visitors_2017.isna().sum()
domestic_visitors_2017[domestic_visitors_2017['visitors'] == ' '].count()

district    36
date        36
month       36
year        36
visitors    36
dtype: int64

In [8]:
domestic_visitors_2018 = pd.read_csv('domestic_visitors/domestic_visitors_2018.csv')
domestic_visitors_2018.head()
domestic_visitors_2018.isnull().sum()
domestic_visitors_2018.isna().sum()


district     0
date         0
month        0
year         0
visitors    12
dtype: int64

To find the null values using is null.
select those rows and use fillna() remove or replace null values

In [9]:
null_mask = domestic_visitors_2018.isnull().any(axis = 1)
null_mask
domestic_visitors_2018[null_mask]
domestic_visitors_2018= domestic_visitors_2018.fillna(' ')
domestic_visitors_2018.isna().sum()

district    0
date        0
month       0
year        0
visitors    0
dtype: int64

In [10]:
domestic_visitors_2018[domestic_visitors_2018['visitors'] == ' '].count()

district    48
date        48
month       48
year        48
visitors    48
dtype: int64

Merge all CSV files into one csv

In [11]:
# To combine all csv files from 2016 to 2019 into one file
file_paths = glob.glob("domestic_visitors/domestic_visitors_*.csv")
df_list = [pd.read_csv(file) for file in file_paths]
domestic_visitors = pd.concat(df_list, ignore_index = True)

# replace any null or empty values with zero
domestic_visitors.replace(" ", 0, inplace= True)
domestic_visitors.fillna(0, inplace = True)

#save it as csv file
domestic_visitors.to_csv('domestic_visitors.csv', index = False)

print(domestic_visitors.info())
print(domestic_visitors.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1512 entries, 0 to 1511
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   district  1512 non-null   object
 1   date      1512 non-null   object
 2   month     1512 non-null   object
 3   year      1512 non-null   int64 
 4   visitors  1512 non-null   object
dtypes: int64(1), object(4)
memory usage: 59.2+ KB
None
   district        date     month  year visitors
0  Adilabad  01-01-2016   January  2016   792136
1  Adilabad  01-02-2016  February  2016   937820
2  Adilabad  01-03-2016     March  2016   582946
3  Adilabad  01-04-2016     April  2016   341948
4  Adilabad  01-05-2016       May  2016   252887


FOREGIN VISITORS DATASETS CLEANING AND MERGING

In [12]:
foreign_visitors_2016 = pd.read_csv('foreign_visitors/foreign_visitors_2016.csv')
foreign_visitors_2016.head()
foreign_visitors_2016.isna().sum()


district    0
date        0
month       0
year        0
visitors    0
dtype: int64

In [13]:
foreign_visitors_2017 = pd.read_csv('foreign_visitors/foreign_visitors_2017.csv')
foreign_visitors_2017.head()
foreign_visitors_2017.isna().sum()

district    0
date        0
month       0
year        0
visitors    0
dtype: int64

In [14]:
foreign_visitors_2018 = pd.read_csv('foreign_visitors/foreign_visitors_2018.csv')
foreign_visitors_2018.head()
foreign_visitors_2018.isnull().sum()

district    0
date        0
month       0
year        0
visitors    0
dtype: int64

In [15]:
foreign_visitors_2019 = pd.read_csv('foreign_visitors/foreign_visitors_2019.csv')
foreign_visitors_2019.head()
foreign_visitors_2019.isna().sum()



district    0
date        0
month       0
year        0
visitors    0
dtype: int64

In [16]:
file_paths = glob.glob("foreign_visitors/foreign_visitors_*.csv")
df_list = [pd.read_csv(file) for file in file_paths]
foreign_visitors = pd.concat(df_list, ignore_index = True)

# replace nan, empty values as 0
foreign_visitors.replace(" ", 0, inplace= True)
#foreign_visitors.fillna(0, inplace = True)

#save it as csv
foreign_visitors.to_csv('foreign_visitors.csv', index = False)

print(foreign_visitors.info())
print(foreign_visitors.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1512 entries, 0 to 1511
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   district  1512 non-null   object
 1   date      1512 non-null   object
 2   month     1512 non-null   object
 3   year      1512 non-null   int64 
 4   visitors  1512 non-null   object
dtypes: int64(1), object(4)
memory usage: 59.2+ KB
None
   district        date     month  year visitors
0  Adilabad  01-01-2016   January  2016        2
1  Adilabad  01-02-2016  February  2016        0
2  Adilabad  01-03-2016     March  2016        2
3  Adilabad  01-04-2016     April  2016        0
4  Adilabad  01-05-2016       May  2016        0
