# Exploratory Data Analysis

In [18]:
#load the libraries
import pandas as pd

Read the CSV file and display the first 5 rows to get a quick look at the data.

In [16]:
# Load the dataset from 'data_clean.csv', resolved relative to the current working directory.
# NOTE(review): the recorded output of this cell is a FileNotFoundError, so the CSV
# must be placed next to the notebook (or the path adjusted) before running — confirm.
data = pd.read_csv("data_clean.csv")  # reading csv file
# Preview the first five rows.
data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'data_clean.csv'

In [None]:

# Print a concise structural summary of the DataFrame.

data.info()

In [None]:
# Data structure: print the object's type and its shape, one per line.
print(type(data), data.shape, sep="\n")

In [None]:
# Show the dtype of every column of the DataFrame.
data.dtypes

In [None]:
# Display summary statistics of the numerical columns.
data.describe()

### This is a comment

# Data type conversion

In [None]:

# Count the missing entries in the 'Month' column.
data['Month'].isna().sum()

In [None]:
# Count the missing entries in the 'Ozone' column.
data['Ozone'].isna().sum()

In [None]:
# Inspect the first 25 values of 'Month', e.g. to look for non-numeric entries before conversion.
data['Month'].head(25)

In [None]:
# Demonstration: a strict conversion of 'Month' is expected to fail (the original
# note says this line "will show an error"). The exception is caught and printed so
# that a "Restart & Run All" is not halted here; the conversion with
# errors='coerce' in the next cell is the one that actually cleans the column.
try:
    data['Month'] = pd.to_numeric(data['Month'])
except (ValueError, TypeError) as err:
    print(f"Strict conversion of 'Month' failed: {err}")

In [None]:

# Convert the 'Month' column to numeric; errors='coerce' turns values that cannot be
# parsed into NaN instead of raising.
data['Month']=pd.to_numeric(data['Month'],errors='coerce')

In [None]:
# Convert the 'Month' column to numeric, coercing invalid values into NaN.
# Example: for original values [1, 1.5, 'NA'], the non-numeric entry 'NA' becomes NaN.
# (This repeats the conversion done in the previous cell.)
data['Month'] = pd.to_numeric(data['Month'], errors='coerce')

# Convert the 'Temp C' column to numeric as well. Invalid entries are coerced to NaN,
# so every remaining temperature value is numeric.
data['Temp C'] = pd.to_numeric(data['Temp C'], errors='coerce')

# Convert the 'Weather' column to the categorical dtype, which is useful for columns
# with a fixed number of unique values.
data['Weather'] = data['Weather'].astype('category')

# Optional: convert the 'Wind' column to an integer dtype (assumes 'Wind' is purely numeric).
# Uncomment the line below only if 'Wind' should hold integers, and make sure it
# contains no NaN values before doing so.
# data['Wind'] = data['Wind'].astype('int64')

In [None]:

# Re-check the structure of the data to verify that the column dtypes changed.
data.info()

#### More on data type conversions
[Stack Overflow: Change column type in pandas](https://stackoverflow.com/questions/15891038/change-column-type-in-pandas)

# Duplicates

In [None]:
# Check for duplicate rows in the DataFrame.
# duplicated() returns a Boolean Series in which True marks a row that repeats an earlier one.
# .head(40) limits the display to the first 40 entries of that Series.
data.duplicated().head(40)

In [None]:
# Shape of the frame holding only the duplicated rows; its first element is the
# number of duplicated rows.
data[data.duplicated()].shape

In [None]:
# Display the duplicated rows themselves.
data[data.duplicated()]

In [None]:

# Drop duplicated rows; the result is stored in 'data_cleaned1' and 'data' is not reassigned.
data_cleaned1=data.drop_duplicates()

In [None]:

# Shape (rows, columns) of the de-duplicated dataset.
data_cleaned1.shape

# Drop columns

In [None]:
# Display the 'data' DataFrame before any columns are dropped.
data

In [None]:
# Remove the 'Unnamed: 0' and 'Temp C' columns from 'data_cleaned1'.
# The 'columns=' keyword names the labels to drop along the column axis.
# 'data_cleaned1' is left untouched; the reduced frame is bound to 'data_cleaned2'.
data_cleaned2 = data_cleaned1.drop(columns=['Unnamed: 0', 'Temp C'])

# Show 'data_cleaned2' to confirm the two columns are gone.
data_cleaned2

# Rename the columns

In [None]:
# Rename the 'Solar.R' column to 'Solar'; the result is bound to 'data_cleaned3'.
data_cleaned3 = data_cleaned2.rename(columns={'Solar.R': 'Solar'})

In [None]:

# Display 'data_cleaned3' to check that the column was renamed to 'Solar'.
data_cleaned3

# Missing Values and Imputation

In [None]:

# Get the column labels of 'data_cleaned3'; 'cols' is reused by the heatmap cell below.
cols = data_cleaned3.columns
cols

In [None]:


# Import the Seaborn library for data visualization
import seaborn as sns
# Two-colour palette for the missing-value heatmap:
# '#000099' (blue) represents non-missing values,
# '#ffff00' (yellow) represents missing values (NaN).
colours = ['#000099', '#ffff00']  # yellow is missing, blue is not missing

# Draw a heatmap of the missing values in 'data_cleaned3' for the columns in 'cols'.
# isnull() returns a DataFrame of booleans that is True where a value is NaN.
# sns.color_palette(colours) builds the palette that is passed to the heatmap as 'cmap'.
sns.heatmap(data_cleaned3[cols].isnull(),
            cmap=sns.color_palette(colours))

In [None]:


# Show the first few rows of 'data_cleaned3' that contain at least one NaN.
data_cleaned3.loc[data_cleaned3.isna().any(axis=1)].head()

In [None]:


# Display the first few rows of 'data_cleaned3' before any imputation.

data_cleaned3.head()

In [None]:
# Backward fill ('bfill'): each NaN is replaced by the next valid value in its column.
# The filled frame is stored in 'df_fill'; 'data_cleaned3' is not reassigned.
df_fill = data_cleaned3.bfill()


In [None]:
# Display the first few rows of 'df_fill' to see the effect of the backward fill.
df_fill.head()


In [None]:
# 'data_cleaned3' remains the same: bfill() returned a new frame ('df_fill') instead of
# modifying it. To change the original, reassign the result or pass inplace=True.
data_cleaned3.head()

In [None]:
# Count the missing values in each column of 'data_cleaned3'. The backward fill above
# was stored in 'df_fill', so these counts still describe the unfilled data.
data_cleaned3.isnull().sum()

In [None]:
# Mean imputation for the 'Ozone' column, step 1:
# compute the column mean (NaN values are ignored by mean()) and print it for inspection.
m = data_cleaned3['Ozone'].mean()
print(m)

In [None]:
# Mean imputation, step 2: replace the missing 'Ozone' values with the mean 'm'
# and write the filled column back into 'data_cleaned3'.
data_cleaned3['Ozone'] = data_cleaned3['Ozone'].fillna(m)


In [None]:
# Verify the mean imputation: count the missing values left in 'Ozone'.
data_cleaned3['Ozone'].isnull().sum()


In [None]:
# Missing value imputation for categorical values.

# Select the categorical 'Weather' column as a one-column DataFrame (note the double
# brackets) so it can be imputed separately.
obj_columns = data_cleaned3[['Weather']]


In [None]:
# Count the missing values in the extracted categorical column(s).
obj_columns.isnull().sum()


In [None]:
# Display the first few rows of the extracted 'Weather' column for inspection.
obj_columns.head()

In [None]:
# Get the mode (most frequent value) of the 'Weather' column.
# The mode is the value used below to impute missing categorical entries.
data_cleaned3[['Weather']].mode()

In [None]:
# mode() on the one-column frame returns a DataFrame; .iloc[0] takes its first row,
# i.e. the first mode if several values are tied. 'z' is therefore keyed by column name.
z = obj_columns.mode().iloc[0]
print(z)  # Display the mode value for inspection.

In [None]:
# Fill the missing values in 'obj_columns' with the mode value(s) held in 'z'.
obj_columns = obj_columns.fillna(z)

In [None]:
# Verify the mode imputation: count the missing values left in 'obj_columns'.
obj_columns.isnull().sum()


In [None]:
# Write the imputed values back: overwrite the 'Weather' column of 'data_cleaned3'
# with the filled one-column frame 'obj_columns'.
data_cleaned3['Weather'] = obj_columns


In [None]:
# Box plot of the columns of the cleaned dataset 'data_cleaned3'.
data_cleaned3.boxplot()

In [None]:
# Join 'data_cleaned3' with the imputed categorical columns in 'obj_columns'.
# 'data_cleaned3' already carries a 'Weather' column, so the columns that also exist in
# 'obj_columns' are dropped first; concatenating both as-is would produce two columns
# with the same 'Weather' label.
# 'axis=1' indicates column-wise concatenation.
data_cleaned4 = pd.concat(
    [data_cleaned3.drop(columns=obj_columns.columns), obj_columns],
    axis=1,
)

In [None]:
# Count the remaining missing values in each column of the combined 'data_cleaned4'.
data_cleaned4.isnull().sum()

In [None]:
# Display the combined 'data_cleaned4' DataFrame.
data_cleaned4

In [None]:
# Bar plot of how often each 'Weather' category occurs.
# NOTE(review): this reads the raw 'data' frame, not the de-duplicated/imputed
# 'data_cleaned3' — confirm that plotting the raw counts is intended.
data['Weather'].value_counts().plot.bar()

# Outlier Detection

In [None]:
# Histogram of the 'Ozone' column of 'data_cleaned3'.
data_cleaned3['Ozone'].hist()

In [None]:
# Box plot of 'Ozone', using only the rows of 'data_cleaned3' that have no NaN in any column.
import matplotlib.pyplot as plt
data_box = data_cleaned3.dropna()
data1_box = data_box['Ozone']
plt.boxplot(data1_box)

In [None]:
# List the attribute and method names available on 'data_box' (exploration aid only).
dir(data_box)

In [None]:
# Create a boxplot for the 'Ozone' column (excluding NaN values) to visualize outliers.
# The returned object is kept in 'box' so its components can be inspected in later cells.
# NOTE(review): this uses the raw 'data' frame rather than 'data_cleaned3' — confirm intent.
box = plt.boxplot(data['Ozone'].dropna())

In [None]:
# Check the type of the boxplot object to understand its structure.
# It is printed explicitly: a bare expression is only displayed when it is the last
# statement of a cell, and here the loop below follows it.
print(type(box))

# Iterate through the dictionary-like boxplot object to inspect its components.
# Each key names one kind of element of the boxplot (like whiskers, fliers, etc.)
# and maps to the objects drawn for it.
for part, artists in box.items():
    print(part, artists)

In [None]:
# Fliers are the outliers: collect the y-values of every flier artist.
[flier.get_ydata() for flier in box['fliers']]

In [None]:
# Whiskers: take the second y-value of each whisker artist.
[whisker.get_ydata()[1] for whisker in box['whiskers']]

In [None]:
# Descriptive statistics of the 'Ozone' column of 'data_cleaned3'.
data_cleaned3['Ozone'].describe()

# Scatter plot and Correlation analysis

In [None]:
# Seaborn visualization library
import seaborn as sns
# Draw the pairwise scatter plots twice for comparison: first with pandas'
# scatter_matrix, then with Seaborn's default pairplot.
pd.plotting.scatter_matrix(data_cleaned3)
sns.pairplot(data_cleaned3)

In [None]:
# Correlation matrix generation.
# 'data_cleaned3' still contains the categorical 'Weather' column, and correlation is
# only defined for numeric data. Selecting the numeric columns explicitly keeps this
# cell working on pandas versions where corr() no longer silently drops
# non-numeric columns.
data_cleaned3.select_dtypes(include='number').corr()

# Transformations

#### Dummy Variable

In [None]:
# Display 'data_cleaned3' before creating dummy variables.
data_cleaned3

In [None]:
# Create dummy (indicator) variables for the 'Weather' column.
# This rebinds 'data_cleaned4', replacing the frame built earlier by pd.concat.
data_cleaned4=pd.get_dummies(data_cleaned3,columns=['Weather'])

In [None]:

# Display 'data_cleaned4' to check that the dummy variables were created.
data_cleaned4

In [None]:
# Drop every row that still contains a missing value before scaling.
data_cleaned4=data_cleaned4.dropna()

#### Normalization of the data

In [None]:
#Normalization of the data
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Extract the underlying NumPy array of 'data_cleaned4' as input for the scalers, and display it.
array = data_cleaned4.values
array

In [None]:

# Initialize the MinMaxScaler, which rescales each feature to the given range (here 0 to 1).
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit the scaler on 'array' and transform it in one step;
# every feature of 'rescaledX' then lies in the range [0, 1].
rescaledX = scaler.fit_transform(array)

In [None]:
# Show the first five rows of the transformed data, printed with two decimals.
set_printoptions(precision=2)
print(rescaledX[:5])


In [None]:
# Standardize data (0 mean, 1 stdev)
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize the features of 'data_cleaned4' with a StandardScaler.
# Note: 'array', 'scaler' and 'rescaledX' are rebound here, replacing the
# MinMaxScaler results from the previous section.
array = data_cleaned4.values
scaler = StandardScaler()
rescaledX = scaler.fit(array).transform(array)

# Summarize the transformed data: first five rows, two decimals.
set_printoptions(precision=2)
print(rescaledX[:5])