# Exploratory Data Analysis

In [1]:

!pip install sweetviz

In [2]:
#load the libraries
import pandas as pd
import numpy as np
import sweetviz as sv

In [3]:
# Read the raw CSV from the notebook's input directory into `data1`.
data1 = pd.read_csv(filepath_or_buffer="../input/dataclean123/data_clean.csv")

In [4]:
# Show the last ten rows of the loaded frame.
data1.iloc[-10:]

In [5]:
# Render the loaded frame as this cell's output.
data1

In [6]:
#Data Structure
# A cell only renders its final expression, so the container type is printed
# explicitly; the (rows, columns) shape remains the cell's displayed result.
print(type(data1))
data1.shape

In [7]:
# Data types: list the dtype of each column of the loaded frame.
data1.dtypes

# Data type conversion

In [8]:
# Print the frame summary for data1 before any type conversion.
data1.info()

In [9]:
# Keep every row but skip the first column by position
# (presumably a saved index column -- TODO confirm against the CSV).
data2=data1.iloc[:,1:]

In [10]:
# Work on an independent deep copy so that the conversions applied to `data`
# below are not reflected in `data2`.
data = data2.copy(deep=True)

In [11]:
# Force 'Month' and 'Temp C' to numeric; errors='coerce' turns any value that
# cannot be parsed as a number into NA instead of raising.
for numeric_col in ('Month', 'Temp C'):
    data[numeric_col] = pd.to_numeric(data[numeric_col], errors='coerce')

# 'Weather' becomes a categorical column; 'Wind' is not converted here.
data['Weather'] = data['Weather'].astype('category')

In [12]:
# Print the frame summary again to check the dtypes after the conversions above.
data.info()

# Duplicates

In [13]:
# Shape of the sub-frame holding only the duplicated rows; its first element
# is the number of duplicated rows.
data.loc[data.duplicated()].shape

In [14]:
# Render the type-converted frame as this cell's output.
data

In [15]:
# Display the rows flagged as repeats of an earlier row (the first occurrence
# of each row is not flagged).
data.loc[data.duplicated(keep='first')]

In [16]:
# Remove repeated rows, keeping the first occurrence of each.
data_cleaned1 = data.drop_duplicates(keep='first')

In [17]:
# Render the de-duplicated frame as this cell's output.
data_cleaned1

# Drop columns

In [18]:
# Drop the 'Temp C' column; the result is a new frame and data_cleaned1 is unchanged.
data_cleaned2 = data_cleaned1.drop(columns=['Temp C'])

In [19]:
# Render the frame without the 'Temp C' column as this cell's output.
data_cleaned2

# Rename the columns

In [20]:
# Give the 'Solar.R' column the shorter name 'Solar'.
data_cleaned3 = data_cleaned2.rename(columns={'Solar.R': 'Solar'})

In [21]:
# Render the renamed frame as this cell's output.
data_cleaned3

# Outlier Detection

In [22]:
# Distribution of the 'Ozone' column as a histogram (10 bins).
data_cleaned3.loc[:, 'Ozone'].hist(bins=10)

In [23]:
# Box plot of 'Ozone', used to spot outlying values.
data_cleaned3.boxplot(column='Ozone')

In [24]:
# Descriptive statistics for the 'Ozone' column.
data_cleaned3['Ozone'].describe()

In [25]:
# Render data_cleaned3 as this cell's output.
data_cleaned3

In [26]:
# Bar chart of how often each 'Weather' category occurs in `data`
# (the frame before de-duplication).
data['Weather'].value_counts().plot(kind='bar')

# Missing Values and Imputation

In [27]:
import seaborn as sns
cols = data_cleaned3.columns
# Two-colour palette for the null mask: blue = value present, yellow = missing.
colours = ['#000099', '#ffff00']
palette = sns.color_palette(colours)
missing_mask = data_cleaned3[cols].isnull()
sns.heatmap(missing_mask, cmap=palette)

In [28]:
# First five rows that contain at least one missing value.
data_cleaned3.loc[data_cleaned3.isna().any(axis='columns')].head(5)

In [29]:
# Number of missing values in each column.
data_cleaned3.isna().sum()

In [30]:
# Mean imputation: compute the mean of 'Ozone', used as the fill value in the
# next cell, and print it for reference.
mean = data_cleaned3['Ozone'].mean()
print(mean)

In [31]:
# Replace missing 'Ozone' values with the mean computed above.
# NOTE: this overwrites the column on data_cleaned3 itself, so earlier cell
# outputs that displayed data_cleaned3 no longer match its current state.
data_cleaned3['Ozone'] = data_cleaned3['Ozone'].fillna(mean)

In [32]:
# Render data_cleaned3 after the 'Ozone' imputation.
data_cleaned3

In [33]:
# Missing value imputation for the categorical value.
# Select the 'Weather' column as a one-column DataFrame (double brackets keep
# it a DataFrame rather than a Series).
obj_columns=data_cleaned3[['Weather']]

In [34]:
# Number of missing 'Weather' values before imputation.
obj_columns.isnull().sum()

In [35]:
# Missing value imputation for the categorical value: fill the gaps with the
# first row of the column-wise mode.
obj_columns=obj_columns.fillna(obj_columns.mode().iloc[0])

In [36]:
# Shape of the main frame, shown for comparison with obj_columns before joining.
data_cleaned3.shape

In [37]:
# Shape of the imputed one-column frame.
obj_columns.shape

In [38]:
#Join the data set with imputed object dataset
# Drop the un-imputed copy of the imputed column(s) first: concatenating
# obj_columns onto the full frame would otherwise leave two columns that are
# both named 'Weather' (one with missing values, one without).
data_cleaned4 = pd.concat(
    [data_cleaned3.drop(columns=obj_columns.columns), obj_columns],
    axis=1,
)

In [39]:
# Render the joined frame as this cell's output.
data_cleaned4

# Scatter plot and Correlation analysis

In [None]:
# seaborn supplies the pairwise scatter-plot grid
import seaborn as sns
# Draw the pairplot of data_cleaned3 with seaborn's default settings
sns.pairplot(data=data_cleaned3)

In [None]:
#Correlation
# Restrict to numeric columns before correlating: data_cleaned3 still carries
# the categorical 'Weather' column, which DataFrame.corr() no longer silently
# skips on pandas 2.x (it raises instead).
data_cleaned3.select_dtypes(include='number').corr()


# Transformations

# Dummy Variable

In [None]:
#Creating dummy variable for Weather column
# Encode the cleaned frame rather than the raw `data`: data_cleaned3 is already
# de-duplicated, has 'Temp C' dropped and 'Solar.R' renamed, and the imputed
# 'Weather' values from obj_columns are substituted in before encoding.
data_cleaned4 = pd.get_dummies(
    data_cleaned3.assign(Weather=obj_columns['Weather']),
    columns=['Weather'],
)

In [None]:
# Render the dummy-encoded frame as this cell's output.
data_cleaned4

In [None]:
# Discard every row that still has at least one missing value.
data_cleaned4 = data_cleaned4.dropna(axis=0, how='any')

In [None]:
#Normalization of the data
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Show the frame's underlying values as an array.
data_cleaned4.values

In [None]:

# Rescale the first five columns of data_cleaned3 to the range [0, 1].
# The columns are selected by position *before* converting to an array, which
# keeps the categorical 'Weather' column out of the conversion instead of
# building an array of the whole frame and slicing it afterwards.
# (assumes the first five columns are numeric -- TODO confirm)
array = data_cleaned3.iloc[:, 0:5].to_numpy()

scaler = MinMaxScaler(feature_range=(0,1))
rescaledX = scaler.fit_transform(array)

# summarize transformed data: first five rows, two decimals
set_printoptions(precision=2)
print(rescaledX[0:5,:])


In [None]:
# Standardize data (0 mean, 1 stdev)
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise every column of data_cleaned4: fit the scaler on the full array,
# then apply the fitted transformation to that same array.
array = data_cleaned4.values
scaler = StandardScaler()
scaler.fit(array)
rescaledX = scaler.transform(array)

# Print the first five standardised rows with two decimals.
set_printoptions(precision=2)
print(rescaledX[:5, :])

# Speed up the EDA process

In [None]:
# Build a sweetviz report from `data` (the type-converted frame, before
# de-duplication and imputation) and write it to 'weather_report.html'.
sweet_report = sv.analyze(data)
sweet_report.show_html('weather_report.html')