**A significant part of a data scientist's role is to explore, analyze, and visualize data.**

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Hours of study recorded for each of the 22 students.
study_hours = [
    10.0, 11.5, 9.0, 16.0, 9.25, 1.0, 11.5, 9.0, 8.5, 14.5, 15.5,
    13.75, 9.0, 8.0, 15.5, 8.0, 9.0, 6.0, 10.0, 12.0, 12.5, 12.0,
]

# Grade for each student, listed in the same positional order as study_hours.
grades = [
    50, 50, 47, 97, 49, 3, 53, 42, 26, 74, 82,
    62, 37, 15, 70, 27, 36, 35, 48, 52, 63, 64,
]

# Combine both lists into a single 2 x 22 array: row 0 holds the hours, row 1 the grades.
# Because the inputs mix floats and ints, NumPy stores every value as float64.
student_data = np.array((study_hours, grades))

In [3]:
# Average each row of student_data: row 0 is study hours, row 1 is grades.
avg_study = np.mean(student_data[0])
avg_grade = np.mean(student_data[1])

# Report both averages rounded to two decimal places.
summary_template = 'Average study hours: {:.2f}\nAverage grade: {:.2f}'
print(summary_template.format(avg_study, avg_grade))

Average study hours: 10.52
Average grade: 49.18


In [4]:
# pandas is already imported as `pd` in the notebook's first cell, so it is not
# imported again here (keeping imports in one place avoids hidden dependencies).

# Build a table with one row per student. The name list has 22 entries, matching the
# 22 values in each row of student_data (row 0 = study hours, row 1 = grades).
# Both numeric columns come out as float64 because student_data is a float array.
df_students = pd.DataFrame({
    'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic', 'Jimmie',
             'Rhonda', 'Giovanni', 'Francesca', 'Rajab', 'Naiyana', 'Kian', 'Jenny',
             'Jakeem', 'Helena', 'Ismat', 'Anila', 'Skye', 'Daniel', 'Aisha'],
    'StudyHours': student_data[0],
    'Grade': student_data[1],
})

df_students

Unnamed: 0,Name,StudyHours,Grade
0,Dan,10.0,50.0
1,Joann,11.5,50.0
2,Pedro,9.0,47.0
3,Rosie,16.0,97.0
4,Ethan,9.25,49.0
5,Vicky,1.0,3.0
6,Frederic,11.5,53.0
7,Jimmie,9.0,42.0
8,Rhonda,8.5,26.0
9,Giovanni,14.5,74.0


In [5]:
# Get the rows whose index *labels* run from 0 to 5.
# .loc selects by label (not by integer position) and its slice INCLUDES the
# upper bound, so this returns six rows (labels 0, 1, 2, 3, 4 and 5).
df_students.loc[0:5]

Unnamed: 0,Name,StudyHours,Grade
0,Dan,10.0,50.0
1,Joann,11.5,50.0
2,Pedro,9.0,47.0
3,Rosie,16.0,97.0
4,Ethan,9.25,49.0
5,Vicky,1.0,3.0


In [6]:
# Get the first five rows by integer position.
# .iloc follows normal Python slicing, so the upper bound is EXCLUDED:
# 0:5 returns positions 0 through 4.
df_students.iloc[0:5]

Unnamed: 0,Name,StudyHours,Grade
0,Dan,10.0,50.0
1,Joann,11.5,50.0
2,Pedro,9.0,47.0
3,Rosie,16.0,97.0
4,Ethan,9.25,49.0


In [7]:
# Fetch part of a column by position with .iloc: rows at positions 0-4 of the
# column at position 2, which is 'Grade' (Name = 0, StudyHours = 1, Grade = 2).
df_students.iloc[0:5,2]

0    50.0
1    50.0
2    47.0
3    97.0
4    49.0
Name: Grade, dtype: float64

In [8]:
# Fetch part of a column by label with .loc: the 'Grade' column for index labels 0 to 5.
# The label slice is inclusive, so six values come back (one more than the .iloc version).
df_students.loc[0:5,'Grade']

0    50.0
1    50.0
2    47.0
3    97.0
4    49.0
5     3.0
Name: Grade, dtype: float64

In [9]:
# Select the row(s) whose 'Name' equals 'Aisha' using a boolean mask.
is_aisha = df_students['Name'] == 'Aisha'
df_students.loc[is_aisha]

Unnamed: 0,Name,StudyHours,Grade
21,Aisha,12.0,64.0


# Reading data from file

In [10]:
# Read the flights dataset from 'flights.csv' into a DataFrame. The path is relative,
# so the file must be present in the notebook's working directory.
flight_data = pd.read_csv('flights.csv')
# Preview the first five rows.
flight_data.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,335,...,4.0,8.0,0,,0,,,,,
1,2008,1,3,4,754.0,735,1002.0,1000,WN,3231,...,5.0,10.0,0,,0,,,,,
2,2008,1,3,4,628.0,620,804.0,750,WN,448,...,3.0,17.0,0,,0,,,,,
3,2008,1,3,4,926.0,930,1054.0,1100,WN,1746,...,3.0,7.0,0,,0,,,,,
4,2008,1,3,4,1829.0,1755,1959.0,1925,WN,3920,...,3.0,10.0,0,,0,2.0,0.0,0.0,0.0,32.0


# Handling missing values (Data Cleaning)

In [11]:
# Count the missing (NaN) entries in every column to see where cleaning is needed.
flight_data.isna().sum()

Year                     0
Month                    0
DayofMonth               0
DayOfWeek                0
DepTime               1142
CRSDepTime               0
ArrTime               1302
CRSArrTime               0
UniqueCarrier            0
FlightNum                0
TailNum               1142
ActualElapsedTime     1302
CRSElapsedTime           0
AirTime               1302
ArrDelay              1302
DepDelay              1142
Origin                   0
Dest                     0
Distance                 0
TaxiIn                1302
TaxiOut               1142
Cancelled                0
CancellationCode     98858
Diverted                 0
CarrierDelay         80371
WeatherDelay         80371
NASDelay             80371
SecurityDelay        80371
LateAircraftDelay    80371
dtype: int64

In [12]:
# Print a summary of the frame: number of entries, and for each column its
# non-null count and dtype.

flight_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 29 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Year               100000 non-null  int64  
 1   Month              100000 non-null  int64  
 2   DayofMonth         100000 non-null  int64  
 3   DayOfWeek          100000 non-null  int64  
 4   DepTime            98858 non-null   float64
 5   CRSDepTime         100000 non-null  int64  
 6   ArrTime            98698 non-null   float64
 7   CRSArrTime         100000 non-null  int64  
 8   UniqueCarrier      100000 non-null  object 
 9   FlightNum          100000 non-null  int64  
 10  TailNum            98858 non-null   object 
 11  ActualElapsedTime  98698 non-null   float64
 12  CRSElapsedTime     100000 non-null  int64  
 13  AirTime            98698 non-null   float64
 14  ArrDelay           98698 non-null   float64
 15  DepDelay           98858 non-null   float64
 16  Ori

In [13]:
# Drop the 'CancellationCode' column: 98,858 of its 100,000 values are null, so it
# carries almost no information.
#
# The result is assigned back instead of using inplace=True, and errors='ignore' is
# passed, so that re-running this cell is harmless. The previous in-place drop raised
# KeyError on a second run because the column was already gone.
flight_data = flight_data.drop(columns=['CancellationCode'], errors='ignore')
flight_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Year               100000 non-null  int64  
 1   Month              100000 non-null  int64  
 2   DayofMonth         100000 non-null  int64  
 3   DayOfWeek          100000 non-null  int64  
 4   DepTime            98858 non-null   float64
 5   CRSDepTime         100000 non-null  int64  
 6   ArrTime            98698 non-null   float64
 7   CRSArrTime         100000 non-null  int64  
 8   UniqueCarrier      100000 non-null  object 
 9   FlightNum          100000 non-null  int64  
 10  TailNum            98858 non-null   object 
 11  ActualElapsedTime  98698 non-null   float64
 12  CRSElapsedTime     100000 non-null  int64  
 13  AirTime            98698 non-null   float64
 14  ArrDelay           98698 non-null   float64
 15  DepDelay           98858 non-null   float64
 16  Ori

In [14]:
# Replace missing tail numbers with the placeholder string 'UNKNOWN'.
#
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on flight_data['TailNum']. That chained in-place form
# operates on an intermediate object: pandas 2.x emits a FutureWarning for it, and with
# Copy-on-Write enabled it leaves the DataFrame unchanged.
flight_data['TailNum'] = flight_data['TailNum'].fillna('UNKNOWN')
flight_data.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,Distance,TaxiIn,TaxiOut,Cancelled,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,335,...,810,4.0,8.0,0,0,,,,,
1,2008,1,3,4,754.0,735,1002.0,1000,WN,3231,...,810,5.0,10.0,0,0,,,,,
2,2008,1,3,4,628.0,620,804.0,750,WN,448,...,515,3.0,17.0,0,0,,,,,
3,2008,1,3,4,926.0,930,1054.0,1100,WN,1746,...,515,3.0,7.0,0,0,,,,,
4,2008,1,3,4,1829.0,1755,1959.0,1925,WN,3920,...,515,3.0,10.0,0,0,2.0,0.0,0.0,0.0,32.0


In [15]:
# After the steps above, the remaining nulls are all in float64 columns.
# Collect those column names, then count the missing values in each of them.
float_columns = flight_data.select_dtypes(include='float64').columns
missing_values_count = flight_data.loc[:, float_columns].isna().sum()
missing_values_count

DepTime               1142
ArrTime               1302
ActualElapsedTime     1302
AirTime               1302
ArrDelay              1302
DepDelay              1142
TaxiIn                1302
TaxiOut               1142
CarrierDelay         80371
WeatherDelay         80371
NASDelay             80371
SecurityDelay        80371
LateAircraftDelay    80371
dtype: int64

In [16]:
# Impute every missing float value with the mean of its own column.
# The result goes into a new frame (flight_data_filled); flight_data itself is left as is.
#
# NOTE(review): DepTime/ArrTime look like hhmm clock values (e.g. 2003.0, 754.0), so a
# column mean may not be a meaningful time - confirm this imputation is intended.
# NOTE(review): the *Delay cause columns are ~80% null; mean-filling them assigns an
# average delay to most rows - confirm this is appropriate for the analysis.
mean_values = flight_data[float_columns].mean()

# Passing a Series to DataFrame.fillna fills each column named in its index with the
# matching value; columns that are not listed are returned unchanged.
flight_data_filled = flight_data.fillna(mean_values)

In [17]:
# Confirm the imputation worked: every column of the filled frame should report 0 nulls.
flight_data_filled.isna().sum()

Year                 0
Month                0
DayofMonth           0
DayOfWeek            0
DepTime              0
CRSDepTime           0
ArrTime              0
CRSArrTime           0
UniqueCarrier        0
FlightNum            0
TailNum              0
ActualElapsedTime    0
CRSElapsedTime       0
AirTime              0
ArrDelay             0
DepDelay             0
Origin               0
Dest                 0
Distance             0
TaxiIn               0
TaxiOut              0
Cancelled            0
Diverted             0
CarrierDelay         0
WeatherDelay         0
NASDelay             0
SecurityDelay        0
LateAircraftDelay    0
dtype: int64

# Explore data in the DataFrame

In [18]:
import matplotlib.pyplot as plt
import seaborn as sns