In [1]:
import pandas as pd
import numpy as np

## 1. Creating a Dataframe from the data

In [48]:
# Column dtypes applied while parsing the CSV. Latitude and Longitude are
# deliberately read as text here; they are cleaned and converted further down.
dtype_dict = dict(
    CircleName=str,
    RegionName=str,
    DivisionName=str,
    OfficeName=str,
    Pincode=int,
    OfficeType=str,
    Delivery=str,
    District=str,
    StateName=str,
    Latitude=str,
    Longitude=str,
)

In [49]:
# Load the pincode CSV, forcing every column to the dtype declared in dtype_dict.
df = pd.read_csv(filepath_or_buffer='pincodes.csv', dtype=dtype_dict)

In [50]:
# Preview the first four rows to sanity-check the parsed columns.
df.head(n=4)

Unnamed: 0,CircleName,RegionName,DivisionName,OfficeName,Pincode,OfficeType,Delivery,District,StateName,Latitude,Longitude
0,Andhra Pradesh Circle,Kurnool Region,Hindupur Division,Peddakotla B.O,515631,BO,Delivery,ANANTAPUR,ANDHRA PRADESH,14.5689,77.85624
1,Andhra Pradesh Circle,Kurnool Region,Hindupur Division,Pinnadhari B.O,515631,BO,Delivery,ANANTAPUR,ANDHRA PRADESH,14.5281,77.857014
2,Andhra Pradesh Circle,Kurnool Region,Hindupur Division,Yerraguntapalle B.O,515631,BO,Delivery,ANANTAPUR,ANDHRA PRADESH,14.561111,77.85715
3,Andhra Pradesh Circle,Kurnool Region,Hindupur Division,Obulareddipalli B.O,515581,BO,Delivery,ANANTAPUR,ANDHRA PRADESH,14.2488,78.2588


## 2. Rows and Columns count

In [51]:
# df.shape is a (rows, columns) pair; pair each dimension with its label.
for label, count in zip(("No of Rows: ", "No of Columns: "), df.shape):
    print(label, count)

No of Rows:  157126
No of Columns:  11


## 3. Column Names

In [52]:
# Collect the column labels into a plain list, then echo them one per line.
columnNames = list(df.columns)
print("Names of all the columns are as follows: ")
for column_name in columnNames:
    print(column_name)

Names of all the columns are as follows: 
CircleName
RegionName
DivisionName
OfficeName
Pincode
OfficeType
Delivery
District
StateName
Latitude
Longitude


## 4. Rows and Columns with missing values

In [53]:
# Build the missing-cell mask once and reduce it along each axis.
missing_mask = df.isna()
rowNull = missing_mask.any(axis=1).sum()  # rows with at least one missing cell
colNull = missing_mask.any(axis=0).sum()  # columns with at least one missing cell

In [54]:
# Report both missing-value counts, one per line.
print(
    f'Number of rows with missing values: {rowNull}',
    f'Number of columns with missing values: {colNull}',
    sep='\n',
)

Number of rows with missing values: 8900
Number of columns with missing values: 4


In [55]:
# Look at the first six rows before any rows are dropped.
df.head(n=6)

Unnamed: 0,CircleName,RegionName,DivisionName,OfficeName,Pincode,OfficeType,Delivery,District,StateName,Latitude,Longitude
0,Andhra Pradesh Circle,Kurnool Region,Hindupur Division,Peddakotla B.O,515631,BO,Delivery,ANANTAPUR,ANDHRA PRADESH,14.5689,77.85624
1,Andhra Pradesh Circle,Kurnool Region,Hindupur Division,Pinnadhari B.O,515631,BO,Delivery,ANANTAPUR,ANDHRA PRADESH,14.5281,77.857014
2,Andhra Pradesh Circle,Kurnool Region,Hindupur Division,Yerraguntapalle B.O,515631,BO,Delivery,ANANTAPUR,ANDHRA PRADESH,14.561111,77.85715
3,Andhra Pradesh Circle,Kurnool Region,Hindupur Division,Obulareddipalli B.O,515581,BO,Delivery,ANANTAPUR,ANDHRA PRADESH,14.2488,78.2588
4,Andhra Pradesh Circle,Kurnool Region,Hindupur Division,Odulapalli B.O,515581,BO,Delivery,ANANTAPUR,ANDHRA PRADESH,14.24555,78.2477
5,Andhra Pradesh Circle,Kurnool Region,Hindupur Division,Peddannavaripalli B.O,515581,BO,Delivery,ANANTAPUR,ANDHRA PRADESH,14.2888,78.2777


### 2. Dropping rows with missing values and resetting the indices of the data frame.

In [56]:
# Discard every row that has a missing value in any column (the dropna defaults,
# spelled out). Note this rebinds df, so the unfiltered frame is no longer reachable.
df = df.dropna(axis=0, how='any')

In [57]:
# Renumber the rows 0..n-1 after the drop; drop=True discards the old index
# instead of keeping it as a new column.
df = df.reset_index(drop=True)

In [58]:
# Summarise dtypes and non-null counts to confirm no missing values remain.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148226 entries, 0 to 148225
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   CircleName    148226 non-null  object
 1   RegionName    148226 non-null  object
 2   DivisionName  148226 non-null  object
 3   OfficeName    148226 non-null  object
 4   Pincode       148226 non-null  int32 
 5   OfficeType    148226 non-null  object
 6   Delivery      148226 non-null  object
 7   District      148226 non-null  object
 8   StateName     148226 non-null  object
 9   Latitude      148226 non-null  object
 10  Longitude     148226 non-null  object
dtypes: int32(1), object(10)
memory usage: 11.9+ MB


### 3. Cleaning the data and converting Longitude and Latitude into Float type

In [59]:
# Naive conversion attempt. Some coordinate strings carry non-numeric
# characters, so astype(float) raises ValueError. The error is the point of
# this cell, so it is caught and displayed instead of left uncaught, which
# would stop a Restart & Run All at this cell.
try:
    df['Latitude'] = df['Latitude'].astype(float)
    df['Longitude'] = df['Longitude'].astype(float)
except ValueError as conversion_error:
    print(f"ValueError: {conversion_error}")

ValueError: could not convert string to float: '21.9161 N'

#### We encounter an error because some of the values contain letters and spaces, which cannot be converted to float directly. We handle these by stripping the non-numeric characters with regular expressions and then converting the cleaned values to float.

In [60]:
import re

In [61]:
# Make sure both coordinate columns hold strings so the regex-based cleaners
# can be applied to every value.
df = df.assign(
    Latitude=df['Latitude'].astype(str),
    Longitude=df['Longitude'].astype(str),
)

In [62]:
def _clean_coordinate(value):
    """Reduce a raw coordinate string to text that ``float()`` can parse.

    A leading minus sign is preserved, a comma sitting between two digits is
    treated as a decimal separator, and every other character that is not a
    digit or a dot (letters such as a trailing ``N``/``E``, spaces, stray
    commas) is removed.

    Parameters
    ----------
    value : str
        Raw coordinate text, e.g. ``'21.9161 N'``.

    Returns
    -------
    str
        The cleaned numeric text, e.g. ``'21.9161'``. May be empty if the
        input contained no digits.
    """
    text = value.strip()
    # Remember the sign before stripping: removing '-' would silently turn a
    # negative coordinate into a positive one.
    sign = '-' if text.startswith('-') else ''
    # float() cannot parse commas; a comma between digits is a decimal comma.
    text = re.sub(r'(?<=\d),(?=\d)', '.', text)
    return sign + re.sub('[^0-9.]', '', text)


def clean_latitude(latitude):
    """Return the latitude string stripped down to a float-parsable number."""
    return _clean_coordinate(latitude)


def clean_longitude(longitude):
    """Return the longitude string stripped down to a float-parsable number."""
    return _clean_coordinate(longitude)

In [63]:
# Run each coordinate column through its matching cleaner, value by value.
for column, cleaner in (('Latitude', clean_latitude), ('Longitude', clean_longitude)):
    df[column] = df[column].apply(cleaner)

In [65]:
# With the stray characters removed, both coordinate columns can now be
# converted to float (Latitude first, then Longitude).
for coordinate_column in ('Latitude', 'Longitude'):
    df[coordinate_column] = df[coordinate_column].astype(float)

##### Latitude and Longitude values of random indices

In [68]:
import random

# Fixed seed so the same rows are sampled on every Restart & Run All; a
# private Random instance leaves the global random state untouched.
RANDOM_SEED = 42
SAMPLE_SIZE = 5

totalRows = len(df)
# sample() draws distinct positions, so no row is picked twice, and min()
# keeps the cell working when the frame has fewer than SAMPLE_SIZE rows
# (for an empty frame, randint(0, totalRows - 1) would raise ValueError).
randomIndices = random.Random(RANDOM_SEED).sample(
    range(totalRows), min(SAMPLE_SIZE, totalRows)
)

In [73]:
# For every sampled index, look up the four fields of interest with scalar
# .at access and print them on a single line.
for i in randomIndices:
    District, Pincode, Latitude, Longitude = (
        df.at[i, field] for field in ('District', 'Pincode', 'Latitude', 'Longitude')
    )
    print(f"At index {i}: District = {District}, Pincode = {Pincode}, Latitude = {Latitude}, Longitude = {Longitude}")

At index 102803: District = JAUNPUR, Pincode = 222133, Latitude = 25.64, Longitude = 82.97
At index 103461: District = Amethi, Pincode = 229304, Latitude = 31.2, Longitude = 75.46
At index 58858: District = BETUL, Pincode = 460666, Latitude = 21.678233, Longitude = 78.182132
At index 1432: District = Y.S.R., Pincode = 516501, Latitude = 14.63, Longitude = 79.06
At index 109187: District = PITHORAGARH, Pincode = 262522, Latitude = 29.5746, Longitude = 80.0657
