In [None]:
#Libraries for Data Analysis & Wrangling
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
import seaborn as sns
import os

# Load the NYC Airbnb listings CSV into a DataFrame.
# A raw string is used because '\A' is not a valid escape sequence; newer
# Python versions warn about it in a normal string literal.
NY_Data = pd.read_csv(r'D:\AB_NYC_2019.csv')

# Preview the first five and the last five rows in a single table.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
pd.concat([NY_Data.head(), NY_Data.tail()])

In [None]:
# Render the dataset as text and write it to the cell output
dataset_text = str(NY_Data)
print(dataset_text)


In [None]:
# Summary of the NYC dataset: column names, non-null counts and dtypes.
# info() writes its report directly to the output and returns None, so it is
# called on its own; wrapping it in print() would add a stray "None" line.
NY_Data.info()

In [None]:
# Data type of every column (dtypes is an attribute, not a method)
column_types = NY_Data.dtypes
print(column_types)

In [None]:
# Descriptive statistics of the dataset's numeric columns,
# as produced by describe()
summary_stats = NY_Data.describe()
print(summary_stats)

In [None]:
# Boolean mask of the dataset: True wherever a cell is missing
# (isna() is the same check as isnull())
missing_mask = NY_Data.isna()
print(missing_mask)

In [None]:
# Count the missing values in each column of the dataset
missing_per_column = NY_Data.isna().sum()
print(missing_per_column)

# Observed: 16 null values in names, 21 null values in host_name, and 10052 null values in both last_review and reviews_per_month

In [None]:
# Data cleaning: summarise how much of each column is missing

# Number of missing cells per column, largest first
m_value = NY_Data.isnull().sum().sort_values(ascending=False)

# Percentage of rows that are missing in each column.
# isnull().mean() is the null count divided by the number of rows, which is
# what a per-column "Percentage" should be based on (dividing by the total
# number of nulls in the whole frame would only give each column's share of
# all missing cells).
per_missing_values = (NY_Data.isnull().mean() * 100).round(2).sort_values(ascending=False)

# Side-by-side table; concat aligns both Series on the column names
null_in_data = pd.concat([m_value, per_missing_values], keys=['Missing values','Percentage'], axis=1)
null_in_data.head(2)

# reviews_per_month and last_review have the same number of missing values.

In [None]:
# Bar chart of the two columns with the most missing values
# (reviews_per_month and last_review); m_value is sorted in descending order,
# so its first two entries are the ones to plot.

top_two_missing = m_value.iloc[:2]

plt.figure(figsize=(10, 5))
missing_plot = sns.barplot(
    x=top_two_missing.index,
    y=top_two_missing,
    palette='bright',
)
# Keep the column names horizontal on the x axis
current_labels = missing_plot.get_xticklabels()
missing_plot.set_xticklabels(current_labels, rotation=0)
plt.title('Missing values in dataset.')
plt.show()

In [None]:
# Drop the columns that are not significant for, or could be unethical to use
# in, the later data exploration and predictions.
# Note: the drop happens in place, so this cell can only be run once per load.
unused_columns = ['id', 'host_name', 'last_review']
NY_Data.drop(columns=unused_columns, inplace=True)
NY_Data

In [None]:
# Replace every NaN in 'reviews_per_month' with 0, then confirm that no
# missing values remain in that column (the count should be 0).
NY_Data['reviews_per_month'] = NY_Data['reviews_per_month'].fillna(0)
NY_Data['reviews_per_month'].isnull().sum()

In [None]:
# Look at the distinct values of a categorical column of interest.
# neighbourhood_group is inspected first because later cells group and plot by it.
# Series.unique() returns the distinct values of the column.
NY_Data['neighbourhood_group'].unique()

In [None]:
# Pie chart of the share of listings in each neighbourhood_group

# Count the listings per group once and reuse the result for labels and sizes
group_counts = NY_Data.neighbourhood_group.value_counts()
labels = group_counts.index
sizes = group_counts.values

# One colour and one wedge offset per group; both lists assume five groups.
# `colors` is also used as the palette of the price bar plot in a later cell.
colors = ['red','beige','yellow','orange','lightpink']
explode = [0.1,0.0,0.3,0.5,0.0]

plt.figure(0,figsize = (7,7))
plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%',shadow=True)
plt.title('Neighbourhood Group',color = 'black',fontsize = 15)
plt.show()

In [None]:
# Price by neighbourhood_group
# Bar plot of price per neighbourhood_group, ordered by median price.

# Median price per group, ascending; used only to fix the order of the bars.
# .median() is used because passing np.median to aggregate() is deprecated
# in recent pandas.
result = (
    NY_Data.groupby('neighbourhood_group')['price']
    .median()
    .reset_index()
    .sort_values('price')
)

# The frame is NY_Data (the name nyc_data is never defined in this notebook).
# NOTE: seaborn's barplot aggregates with the mean by default, so the bar
# heights are mean prices while the bar order follows the median.
sns.barplot(x='neighbourhood_group', y='price', data=NY_Data, palette=colors, order=result['neighbourhood_group'])
plt.title('Price by neighbourhood group')
plt.xticks(rotation=45)
plt.show()

# Manhattan has the highest price.

In [None]:
# Count plot of room types within each neighbourhood_group

# Alphabetical borough order on the x axis
borough_order = sorted(NY_Data['neighbourhood_group'].dropna().unique())

plt.figure(figsize=(14,8))
# x and hue are taken from the same frame by column name, so they cannot get
# out of step with each other, and keyword arguments are used because recent
# seaborn releases no longer accept the data vector positionally.
sns.countplot(data=NY_Data, x='neighbourhood_group', hue='room_type', order=borough_order)
plt.title('Borough wise room type count')
plt.xlabel('Borough name')
plt.ylabel('Count')
plt.show()


In [None]:
# Box plot of availability_365 for each neighbourhood_group,
# ordered by the group's median availability_365.

# Median availability per group, ascending; fixes the order of the boxes
# in the same way the price bar plots are ordered.
result = (
    NY_Data.groupby('neighbourhood_group')['availability_365']
    .median()
    .reset_index()
    .sort_values('availability_365')
)
sns.boxplot(x='neighbourhood_group', y='availability_365', data=NY_Data, order=result['neighbourhood_group'])
plt.title('availability_365 by neighbourhood group')
plt.show()

In [None]:
# Pie chart of the share of listings per room type

# Count the listings per room type once and reuse it for labels and sizes
room_counts = NY_Data.room_type.value_counts()
labels = room_counts.index
sizes = room_counts.values

# One colour and one wedge offset per room type; both lists assume three types
colors = ['skyblue','pink','yellow']
explode = [0,0.05,0.5]

plt.figure(0,figsize = (7,7))
plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%', shadow=True)
plt.title('Room-Type',color = 'Brown',fontsize = 15)
plt.show()

In [None]:
# Price by room_type, bars ordered by median price.

# Median price per room type, ascending; used only to fix the bar order.
# .median() is used because passing np.median to aggregate() is deprecated
# in recent pandas.
result = (
    NY_Data.groupby('room_type')['price']
    .median()
    .reset_index()
    .sort_values('price')
)
# NOTE: seaborn's barplot aggregates with the mean by default, so the bar
# heights are mean prices while the bar order follows the median.
sns.barplot(x='room_type', y='price', data=NY_Data, order=result['room_type'])
plt.title('Price by room type')
plt.show()

In [None]:
# Line plot of price against availability_365, drawn with seaborn's lineplot()

sns.lineplot(
    data=NY_Data,
    x='availability_365',
    y='price',
)
plt.show()

In [None]:
# Scatter plot of listing coordinates, coloured by neighbourhood_group

plt.figure(figsize=(10,6))
# Columns are passed by name with keyword arguments: recent seaborn releases
# accept only `data` positionally, so positional x/y vectors raise a TypeError.
sns.scatterplot(data=NY_Data, x='longitude', y='latitude', hue='neighbourhood_group')
# Render the figure like the other plotting cells do (plt.ioff() only switches
# interactive mode off and does not display anything).
plt.show()

In [None]:

# Finding outliers
# An outlier is a data point that is distant from the other observations,
# i.e. one that lies outside the overall distribution of the dataset.
# A box plot of price is used to spot them.

# The Series is passed explicitly as x so the plot is a horizontal box plot of
# price regardless of how the seaborn version interprets a positional argument.
sns.boxplot(x=NY_Data['price'])
plt.show()

# Some values are clearly out of bounds (e.g. prices between 8000 and 10000).