# Import Libraries

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import os 

## Read CSV File / Import dataset

In [None]:
# Location of the CO2 dataset. Defaults to the original local path; set the
# CO2_DATA_PATH environment variable to load the CSV from somewhere else
# without editing the notebook.
data_path = os.environ.get(
    "CO2_DATA_PATH",
    "/Users/rojaywhite/Desktop/CO2 EDA /res/Data.csv",
)
co2_df = pd.read_csv(data_path)

# Report the size of the loaded table.
n_rows, n_cols = co2_df.shape
print(f"Rows: {n_rows} | Col: {n_cols}")

## Exploring CO2 Dataset [EDA]

* First, make a copy of the original dataset

In [None]:
# Work on an independent deep copy so that later cleaning steps never touch
# the originally loaded dataframe. `deep` expects a boolean, not a string.
co2dfCopy = co2_df.copy(deep=True)

# Print the first rows of both frames to confirm the copy matches the original.
print("Original Dataframe: 'co2_df'")
print(co2_df.head())
print("\n________________________________________________________________\n")
print("Copied Dataframe: 'co2dfCopy'")
print(co2dfCopy.head())

In [None]:
# Show the first five rows of the working copy.
co2dfCopy.head(5)

In [None]:
# Print a concise summary of the working copy (columns, non-null counts, dtypes).
co2dfCopy.info()

In [None]:
# List the column labels of the working copy as a one-column table.
column_labels = co2dfCopy.columns
pd.DataFrame({"Columns": column_labels})

In [None]:
# Map every column label to its dtype and print the mapping.
data_types = {column: dtype for column, dtype in co2dfCopy.dtypes.items()}
print(data_types)

In [None]:
# Count the distinct values in each of the selected columns.
inspected_columns = ["Car", "Model", "Volume", "Weight", "CO2"]
cols = co2dfCopy[inspected_columns]
cols.nunique()

In [None]:
# Keep only the Car, Model and CO2 columns in a dataframe named "vehicle".
vehicle_columns = ["Car", "Model", "CO2"]
vehicle = co2dfCopy[vehicle_columns]
vehicle

In [None]:
# List the distinct values found in the Model column.
model_column = co2dfCopy["Model"]
model_column.unique()

In [None]:
# Show the rows whose Model value is one of the number-like model names.
# `isin` replaces a long chain of `==` comparisons joined with `|`.
numeric_model_names = ["500", "1", "3", "5", "2"]
co2dfCopy[co2dfCopy["Model"].isin(numeric_model_names)]

* Observation: these car models are identified by numbers, whereas the other models have names

* Note: they are still stored with the `object` dtype

In [None]:
# Re-print the column -> dtype mapping of the working copy.
data_types = dict(co2dfCopy.dtypes.items())
print(data_types)

In [None]:
# Show ten randomly chosen rows of the working copy.
co2dfCopy.sample(10)

## Cleaning the Dataset
#### NaN Values in Dataset

In [None]:
# Print a brief summary of the dataframe (columns, non-null counts, dtypes);
# info() writes the summary to the output rather than returning it.
co2dfCopy.info()

In [None]:
# Count the missing values in every column (isna is the same check as isnull).
missing_mask = co2dfCopy.isna()
missing_mask.sum()

* Column "Unnamed: 5" has all Null values

In [None]:
# Give the "Unnamed: 5" column a shorter label, modifying the frame in place.
column_renames = {"Unnamed: 5": "Unnamed"}
co2dfCopy.rename(columns=column_renames, inplace=True)

In [None]:
# Replace the missing entries of the "Unnamed" column with a placeholder.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a selected column mutates an intermediate object, which triggers a warning
# on recent pandas and leaves co2dfCopy unchanged under Copy-on-Write.
co2dfCopy["Unnamed"] = co2dfCopy["Unnamed"].fillna(value="No name")
co2dfCopy["Unnamed"].sample(n=10)

* Let's drop the "Unnamed" column; there is no need for it

In [None]:
# Remove the "Unnamed" column in place, then list the remaining columns.
co2dfCopy.drop(columns='Unnamed', inplace=True)
remaining_columns = co2dfCopy.columns
pd.DataFrame({"Columns in Dataframe": remaining_columns})

## Visualizations

In [None]:
# Horizontal bar chart of how many rows each Car value has.
rows_per_car = co2dfCopy["Car"].value_counts()
rows_per_car.plot(kind="barh", title=" Vehicle Count")

In [None]:
# Show the Mercedes and Skoda rows, ordered by the Car column.
is_selected_car = co2dfCopy["Car"].isin(["Mercedes", "Skoda"])
co2dfCopy[is_selected_car].sort_values('Car')

In [None]:
# Dataframe holding only the Car and CO2 columns.
emission_columns = ["Car", "CO2"]
emissions = co2dfCopy[emission_columns]

In [None]:
# Mean CO2 per Car, sorted from the lowest to the highest mean.
mean_co2_per_car = emissions.groupby("Car")["CO2"].mean()
avg_emissions_by_car = mean_co2_per_car.reset_index().sort_values(by="CO2", ascending=True)
avg_emissions_by_car.head(n=10)

In [None]:

# Summary statistics of the per-Car mean CO2 values.
avg_emissions_summary = avg_emissions_by_car.describe()
avg_emissions_summary

In [None]:

# Summed CO2 per Car, sorted from the lowest to the highest total.
summed_co2_per_car = emissions.groupby("Car")["CO2"].sum()
total_emissions_by_cars = summed_co2_per_car.reset_index().sort_values(by="CO2", ascending=True)
total_emissions_by_cars.head(n=10)

In [None]:
# Summary statistics of the per-Car CO2 totals.
total_emissions_summary = total_emissions_by_cars.describe()
total_emissions_summary

## Average Co2 Emissions Visualisation

In [None]:
# Horizontal bar chart of the mean CO2 value per Car.
sns.set(style="darkgrid")
plt.figure(figsize=(10, 7))

sns.barplot(x="CO2",
            y="Car",
            data=avg_emissions_by_car).set(title="Co2 emissions per Car")

In [None]:
# Vertical bar chart of the mean CO2 value per Car (wider figure so the
# Car labels on the x-axis have room).
sns.set(style="darkgrid")
plt.figure(figsize=(15, 7))

sns.barplot(x="Car",
            y="CO2",
            data=avg_emissions_by_car).set(title="Co2 emissions per Car")

#### In terms of avg emissions:
* Fiat is the lowest, with an average CO2 value of 90
* Mercedes is the highest, with an average CO2 value of 108

In [None]:
# Plot the distribution of the per-Car average emissions as a histogram
# with a kernel density estimate overlaid.
sns.displot(kde=True, data=avg_emissions_by_car)

## Total Co2 Emissions (Sum)

In [None]:
# Horizontal bar chart of the summed CO2 value per Car. The title names the
# sum explicitly so this chart is not confused with the average charts.
sns.set(style="darkgrid")
plt.figure(figsize=(10, 7))

sns.barplot(x="CO2",
            y="Car",
            data=total_emissions_by_cars).set(title="Sum Co2 emissions per car")

In [None]:
# Vertical bar chart of the summed CO2 value per Car.
sns.set(style="darkgrid")
plt.figure(figsize=(15, 7))

total_axes = sns.barplot(data=total_emissions_by_cars, x="Car", y="CO2")
total_axes.set(title="Sum Co2 emissions per car")

#### In terms of sum emissions:
* Fiat is the lowest with 90 total emissions
* Mercedes is the highest with 528 total emissions

In [None]:
# Keep only the rows with the highest and the lowest summed CO2. The extremes
# are computed from the data instead of being hard-coded, so the selection
# stays correct if the dataset changes.
co2_totals = total_emissions_by_cars["CO2"]
is_extreme_total = (co2_totals == co2_totals.max()) | (co2_totals == co2_totals.min())
sum_highest_lowest_emissions = total_emissions_by_cars[is_extreme_total]
sum_highest_lowest_emissions

In [None]:
# Bar chart comparing the highest and the lowest summed CO2 per Car.
sns.set(style="darkgrid")
plt.figure(figsize=(8, 6))

sns.barplot(x="Car",
            y="CO2",
            data=sum_highest_lowest_emissions).set(title=" 'Sum' of Highest vs Lowest emission")

## Average Weight by Car Brand

In [None]:
# Dataframe holding only the Car and Weight columns.
weight_columns = ['Car', 'Weight']
car_weight = co2dfCopy[weight_columns]
car_weight.head(n=10)

In [None]:
# Mean Weight per Car, sorted from the lightest to the heaviest mean.
mean_weight_per_car = car_weight.groupby('Car')['Weight'].mean()
avg_car_weight = mean_weight_per_car.reset_index().sort_values(by='Weight')
avg_car_weight.head(n=10)

In [None]:
# Horizontal bar chart of the mean Weight per Car.
sns.set(style="darkgrid")
plt.figure(figsize=(10, 7))

sns.barplot(x="Weight",
            y="Car",
            data=avg_car_weight).set(title="Average Weight per Car")

In [None]:
# Distribution of the per-Car average weights, with a kernel density
# estimate overlaid on the histogram.
sns.displot(kde=True, data=avg_car_weight)

In [None]:
# Show five randomly chosen rows of the working copy.
co2dfCopy.sample(5)

In [None]:
# Count the non-null Car entries for every Model and plot the counts as a
# horizontal bar chart, ordered from the smallest to the largest count.
Models = co2dfCopy[["Car", "Model"]]
car_count_per_model = Models.groupby("Model")["Car"].count()
cars_by_Models = car_count_per_model.reset_index().sort_values(by="Car", ascending=True)

sns.set(style="darkgrid")
plt.figure(figsize=(30, 13))

model_axes = sns.barplot(data=cars_by_Models, x='Car', y='Model')
model_axes.set(title="# of Cars per Model")

In [None]:
# Show the rows whose Model is "Fiesta".
is_fiesta = co2dfCopy["Model"] == "Fiesta"
co2dfCopy[is_fiesta]

In [None]:
# Summary statistics of the Weight column.
weight_column = co2dfCopy["Weight"]
weight_column.describe()

In [None]:
# Select the rows with the minimum and the maximum Weight. The extremes are
# computed from the column instead of being hard-coded, so the selection
# stays correct if the dataset changes.
vehicle_weights = co2dfCopy["Weight"]
is_extreme_weight = (vehicle_weights == vehicle_weights.min()) | (vehicle_weights == vehicle_weights.max())
heaviest_Smallest = co2dfCopy[is_extreme_weight]
print("The smallest & heaviest vehicles in the set are: \n\n", heaviest_Smallest)

In [None]:
# Show how the vehicle weights are distributed: a 10-bin histogram with a
# kernel density estimate overlaid.
weight_values = co2dfCopy['Weight']
sns.displot(weight_values, bins=10, kde=True)

* Most of the vehicle weights lie somewhere between 1200 and 1600

In [None]:
# Display the per-Car CO2 totals again before plotting their distribution.
total_emissions_by_cars

In [None]:
# Distribution of the per-Car CO2 totals.
# Note: "kde" stands for Kernel Density Estimation.
co2_total_values = total_emissions_by_cars['CO2']
sns.displot(co2_total_values, kde=True)

In [None]:
# Joint plot of Weight against CO2 for every row of the working copy.
sns.jointplot(data=co2dfCopy, x='Weight', y='CO2')

In [None]:
# Pairwise plots of the columns, coloured by the Car column.
sns.pairplot(data=co2dfCopy, hue='Car')