## Loading Data from a Pandas DataFrame into a MySQL Table

#### Here's a typical project flow:
####          1. Data cleaning in Python (using pandas)
####          2. Store the cleaned data in a SQL database
####          3. Perform SQL operations

In [1]:
# Importing Dependencies

import pandas as pd
import mysql.connector
from sqlalchemy import create_engine

## Data

In [2]:
# Load the raw employee CSV into a DataFrame.
df = pd.read_csv('./Employee Data.csv', encoding='latin-1')

# If reading the CSV fails with a decoding error, pass encoding='latin-1' as done above.

In [3]:
# Preview the first rows; salaries ("$92,368") and bonuses ("0%") are still in their raw CSV form here.
df.head()

Unnamed: 0,Employee ID,Full Name,Job Title,Department,Business Unit,Gender,Ethnicity,Age,Hire Date,Annual Salary,Bonus %,Country,City,Exit Date
0,E02002,Kai Le,Controls Engineer,Engineering,Manufacturing,Male,Asian,47.0,2/5/2022,"$92,368",0%,United States,Columbus,
1,E02003,Robert Patel,Analyst,Sales,Corporate,Male,Asian,58.0,10/23/2013,"$45,703",0%,United States,Chicago,
2,E02004,Cameron Lo,Network Administrator,IT,Research & Development,Male,Asian,34.0,3/24/2019,"$83,576",0%,China,Shanghai,
3,E02005,Harper Castillo,IT Systems Architect,IT,Corporate,Female,Latino,39.0,4/7/2018,"$98,062",0%,United States,Seattle,
4,E02006,Harper Dominguez,Director,Engineering,Corporate,Female,Latino,42.0,6/18/2005,"$175,391",24%,United States,Austin,


In [4]:
# Preview the last rows of the raw data.
df.tail()

Unnamed: 0,Employee ID,Full Name,Job Title,Department,Business Unit,Gender,Ethnicity,Age,Hire Date,Annual Salary,Bonus %,Country,City,Exit Date
1257,E02250,Mila Han,Manager,Sales,Manufacturing,Female,Asian,54.0,11/14/2009,"$128,791",6%,United States,Miami,
1258,E02251,Genesis Herrera,Manager,IT,Research & Development,Female,Latino,34.0,10/3/2015,"$126,898",10%,Brazil,Manaus,
1259,E02252,Olivia Vazquez,Network Engineer,IT,Specialty Products,Female,Latino,53.0,4/13/2020,"$93,053",0%,Brazil,Sao Paulo,
1260,E02253,Leilani Ng,Systems Analyst,IT,Corporate,Female,Asian,48.0,9/19/2011,"$50,513",0%,United States,Seattle,10/30/2019
1261,E02254,Olivia Mendoza,Sr. Account Representative,Sales,Corporate,Female,Latino,43.0,5/7/2017,"$86,533",0%,United States,Columbus,


In [5]:
# Parse both date columns in one column-wise pass; values that cannot be parsed
# become NaT because of errors='coerce'.
date_cols = ['Hire Date', 'Exit Date']
df[date_cols] = df[date_cols].apply(pd.to_datetime, errors='coerce')

In [7]:
# Turn "24%"-style text into the number 24, then drop the '%' from the column name.
bonus_without_sign = df['Bonus %'].str.rstrip('%')
df['Bonus %'] = pd.to_numeric(bonus_without_sign)
df.rename(columns={'Bonus %': 'Bonus'}, inplace=True)

In [10]:
# Keep only the digits of each salary ("$92,368" -> "92368") and convert to a number.
# Values left empty or unparseable become NaN (errors='coerce').
# The pattern is a raw string: '\d' inside a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
df['Annual Salary'] = pd.to_numeric(
    df['Annual Salary'].str.replace(r'[^\d]', '', regex=True),
    errors='coerce',
)

In [13]:
# Age is displayed as a float (47.0) in the raw preview; the nullable 'Int64' dtype
# stores whole numbers while still allowing missing values.
df['Age'] = df['Age'].astype('Int64')

In [14]:
# Summarize column dtypes and non-null counts to confirm the type conversions took effect.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1262 entries, 0 to 1261
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Employee ID    1243 non-null   object        
 1   Full Name      1222 non-null   object        
 2   Job Title      1204 non-null   object        
 3   Department     1215 non-null   object        
 4   Business Unit  1180 non-null   object        
 5   Gender         1213 non-null   object        
 6   Ethnicity      1220 non-null   object        
 7   Age            1256 non-null   Int64         
 8   Hire Date      1227 non-null   datetime64[ns]
 9   Annual Salary  1189 non-null   float64       
 10  Bonus          1214 non-null   float64       
 11  Country        1156 non-null   object        
 12  City           1207 non-null   object        
 13  Exit Date      125 non-null    datetime64[ns]
dtypes: Int64(1), datetime64[ns](2), float64(2), object(9)
memory usage: 139.

In [None]:
# Display the frame; pandas abbreviates long frames to their first and last rows.
df

In [23]:
# Collect every row whose Employee ID occurs more than once; keep=False flags all
# copies, including the first occurrence.
repeated_id_mask = df.duplicated(subset='Employee ID', keep=False)
duplicates = df.loc[repeated_id_mask]

In [24]:
# Sort the repeated-ID rows so that copies of the same Employee ID sit next to each other.
sorted_dups = duplicates.sort_values('Employee ID')

In [22]:
# Preview only: the de-duplicated frame is displayed but not assigned, so df itself
# is unchanged. keep="first" retains the earliest row for each Employee ID.
df.drop_duplicates(subset='Employee ID', keep="first")

Unnamed: 0,Employee ID,Full Name,Job Title,Department,Business Unit,Gender,Ethnicity,Age,Hire Date,Annual Salary,Bonus,Country,City,Exit Date
0,E02002,Kai Le,Controls Engineer,Engineering,Manufacturing,Male,Asian,47,2022-02-05,92368.0,0.0,United States,Columbus,NaT
1,E02003,Robert Patel,Analyst,Sales,Corporate,Male,Asian,58,2013-10-23,45703.0,0.0,United States,Chicago,NaT
2,E02004,Cameron Lo,Network Administrator,IT,Research & Development,Male,Asian,34,2019-03-24,83576.0,0.0,China,Shanghai,NaT
3,E02005,Harper Castillo,IT Systems Architect,IT,Corporate,Female,Latino,39,2018-04-07,98062.0,0.0,United States,Seattle,NaT
4,E02006,Harper Dominguez,Director,Engineering,Corporate,Female,Latino,42,2005-06-18,175391.0,24.0,United States,Austin,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
997,E02999,Layla Wu,,Sales,Research & Development,Female,Asian,31,2018-08-27,,0.0,United States,,NaT
998,E03000,Thomas Lam,,Engineering,Manufacturing,Male,Asian,52,2022-11-18,,,United States,,NaT
999,E03001,Willow Taylor,,Sales,Corporate,Female,Black,27,2021-02-08,51321.0,,,Seattle,NaT
1016,E02960,Lucas Sandoval,Sr. Analyst,Sales,Manufacturing,Male,Latino,41,2011-05-19,84627.0,0.0,Brazil,Manaus,NaT


In [67]:
# NOTE(review): repeats the Int64 cast already applied to 'Age' in an earlier cell;
# when the cells run in order this is a no-op.
df['Age'] = df['Age'].astype("Int64")

In [68]:
# NOTE(review): the saved output of this cell shows '$'-formatted salaries even though
# an earlier cell converts that column to numbers, so the stored outputs do not come
# from a single top-to-bottom run. Restart the kernel and run all cells to refresh them.
df.head()

Unnamed: 0,Employee ID,Full Name,Job Title,Department,Business Unit,Gender,Ethnicity,Age,Hire Date,Annual Salary,Bonus,Country,City,Exit Date
0,E02002,Kai Le,Controls Engineer,Engineering,Manufacturing,Male,Asian,47,2022-02-05,"$92,368",0.0,United States,Columbus,NaT
1,E02003,Robert Patel,Analyst,Sales,Corporate,Male,Asian,58,2013-10-23,"$45,703",0.0,United States,Chicago,NaT
2,E02004,Cameron Lo,Network Administrator,IT,Research & Development,Male,Asian,34,2019-03-24,"$83,576",0.0,China,Shanghai,NaT
3,E02005,Harper Castillo,IT Systems Architect,IT,Corporate,Female,Latino,39,2018-04-07,"$98,062",0.0,United States,Seattle,NaT
4,E02006,Harper Dominguez,Director,Engineering,Corporate,Female,Latino,42,2005-06-18,"$175,391",24.0,United States,Austin,NaT


In [75]:
# Show the rows where Age is missing.
age_is_missing = df['Age'].isna()
df[age_is_missing]

Unnamed: 0,Employee ID,Full Name,Job Title,Department,Business Unit,Gender,Ethnicity,Age,Hire Date,Annual Salary,Bonus,Country,City,Exit Date
795,E02797,Autumn Kaur,,Accounting,,Female,Asian,,NaT,"$129,172",,China,Beijing,NaT
867,E02869,Aubrey Cheng,Analyst II,Accounting,,Female,Asian,,2001-06-17,"$64,882",0.0,United States,,NaT
871,E02873,Asher Bui,Sr. Account Representative,Sales,Specialty Products,Male,,,2009-01-27,"$90,881",0.0,,Shanghai,NaT
883,E02885,Colton Adams,Manager,Marketing,Specialty Products,Male,Caucasian,,NaT,,8.0,United States,Columbus,NaT
1090,E02504,Victoria Vo,Field Engineer,Engineering,Corporate,Female,,,2017-12-18,"$82,241",0.0,China,Shanghai,NaT
1112,E02210,Liliana Collins,Sr. Manager,Human Resources,Specialty Products,Female,Black,,2013-01-18,"$151,666",13.0,United States,Chicago,2018-06-30


In [83]:
# Strip the leading '$' from salaries that are still text.
# An earlier cell already converts 'Annual Salary' to numbers, so on a top-to-bottom
# run the .str accessor would raise AttributeError here; skip the step in that case.
if not pd.api.types.is_numeric_dtype(df['Annual Salary']):
    df['Annual Salary'] = df['Annual Salary'].str.lstrip('$')

In [90]:
# Keep only the digits of each salary and convert to a number; empty or unparseable
# values become NaN (errors='coerce').
# - Raw string: '\d' inside a normal string literal is an invalid escape sequence.
# - Guard: if the column is already numeric (an earlier cell performs the same
#   conversion), leave it alone, because the .str accessor only works on text columns.
if not pd.api.types.is_numeric_dtype(df['Annual Salary']):
    df['Annual Salary'] = pd.to_numeric(
        df['Annual Salary'].str.replace(r'[^\d]', '', regex=True),
        errors='coerce',
    )

In [93]:
# Inspect the salary column after conversion; it should now be numeric (float64).
df['Annual Salary']

0        92368.0
1        45703.0
2        83576.0
3        98062.0
4       175391.0
          ...   
1257    128791.0
1258    126898.0
1259     93053.0
1260     50513.0
1261     86533.0
Name: Annual Salary, Length: 1262, dtype: float64

In [94]:
# Preview the frame now that salaries are numeric.
df.head()

Unnamed: 0,Employee ID,Full Name,Job Title,Department,Business Unit,Gender,Ethnicity,Age,Hire Date,Annual Salary,Bonus,Country,City,Exit Date
0,E02002,Kai Le,Controls Engineer,Engineering,Manufacturing,Male,Asian,47,2022-02-05,92368.0,0.0,United States,Columbus,NaT
1,E02003,Robert Patel,Analyst,Sales,Corporate,Male,Asian,58,2013-10-23,45703.0,0.0,United States,Chicago,NaT
2,E02004,Cameron Lo,Network Administrator,IT,Research & Development,Male,Asian,34,2019-03-24,83576.0,0.0,China,Shanghai,NaT
3,E02005,Harper Castillo,IT Systems Architect,IT,Corporate,Female,Latino,39,2018-04-07,98062.0,0.0,United States,Seattle,NaT
4,E02006,Harper Dominguez,Director,Engineering,Corporate,Female,Latino,42,2005-06-18,175391.0,24.0,United States,Austin,NaT


In [95]:
# Re-check dtypes and non-null counts before looking at missing and repeated IDs.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1262 entries, 0 to 1261
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Employee ID    1243 non-null   object        
 1   Full Name      1222 non-null   object        
 2   Job Title      1204 non-null   object        
 3   Department     1215 non-null   object        
 4   Business Unit  1180 non-null   object        
 5   Gender         1213 non-null   object        
 6   Ethnicity      1220 non-null   object        
 7   Age            1256 non-null   Int64         
 8   Hire Date      1227 non-null   datetime64[ns]
 9   Annual Salary  1189 non-null   float64       
 10  Bonus          1214 non-null   float64       
 11  Country        1156 non-null   object        
 12  City           1207 non-null   object        
 13  Exit Date      125 non-null    datetime64[ns]
dtypes: Int64(1), datetime64[ns](2), float64(2), object(9)
memory usage: 139.

In [98]:
# Number of rows that have no Employee ID.
df['Employee ID'].isna().sum()

19

In [99]:
# 1262
# The frame has 1262 rows, while the IDs shown by head()/tail() run from E02002 to
# E02254 — presumably a rough check of how far apart those two IDs are (TODO confirm intent).
2254-2002

252

In [102]:
# Flag rows whose Employee ID already appeared earlier in the frame
# (the first occurrence of each ID is marked False).
df['Employee ID'].duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1257     True
1258     True
1259     True
1260     True
1261     True
Name: Employee ID, Length: 1262, dtype: bool

In [103]:
# Inspect the Employee ID values themselves.
df['Employee ID']

0       E02002
1       E02003
2       E02004
3       E02005
4       E02006
         ...  
1257    E02250
1258    E02251
1259    E02252
1260    E02253
1261    E02254
Name: Employee ID, Length: 1262, dtype: object

In [104]:
# Look at the last 15 rows, where the duplicated() flags are True.
df.tail(15)

Unnamed: 0,Employee ID,Full Name,Job Title,Department,Business Unit,Gender,Ethnicity,Age,Hire Date,Annual Salary,Bonus,Country,City,Exit Date
1247,E02240,Ryan Yi,Solutions Architect,IT,Specialty Products,Male,Asian,56,2003-08-17,82758.0,0.0,United States,Seattle,NaT
1248,E02241,Carter Simmons,Manager,IT,Corporate,Male,Caucasian,41,2012-05-30,126406.0,10.0,United States,Chicago,NaT
1249,E02242,Emilia Xu,Manager,Marketing,Corporate,Female,Asian,46,2017-01-24,103147.0,6.0,United States,Miami,NaT
1250,E02243,Adam Medina,IT Systems Architect,IT,Manufacturing,Male,Latino,49,2020-10-04,81622.0,0.0,United States,Chicago,NaT
1251,E02244,Connor Howard,Systems Analyst,IT,Specialty Products,Male,Caucasian,42,2021-03-15,44265.0,0.0,United States,Austin,2022-06-27
1252,E02245,Audrey Duong,Vice President,Sales,Specialty Products,Female,Asian,55,1999-03-03,213998.0,34.0,United States,Miami,NaT
1253,E02246,Landon Reyes,Director,IT,Research & Development,Male,Latino,44,2014-08-23,171823.0,27.0,United States,Columbus,NaT
1254,E02247,Noah Ma,Vice President,Accounting,Manufacturing,Male,Asian,28,2018-11-06,201013.0,31.0,United States,Phoenix,NaT
1255,E02248,Lucas Alexander,Director,IT,Manufacturing,Male,Black,41,2013-09-20,192944.0,22.0,United States,Miami,NaT
1256,E02249,Henry Vo,Controls Engineer,Engineering,Manufacturing,Male,Asian,60,1996-09-13,81699.0,0.0,China,Beijing,NaT


In [106]:
# Count rows that are exact copies of an earlier row across all columns.
df.duplicated().sum()

135

In [110]:
# All rows that share an Employee ID with at least one other row; keep=False flags
# every copy, including the first occurrence.
duplicates = df[df.duplicated('Employee ID', keep=False)]

In [108]:
# Display the rows with repeated Employee IDs.
duplicates

Unnamed: 0,Employee ID,Full Name,Job Title,Department,Business Unit,Gender,Ethnicity,Age,Hire Date,Annual Salary,Bonus,Country,City,Exit Date
0,E02002,Kai Le,Controls Engineer,Engineering,Manufacturing,Male,Asian,47,2022-02-05,92368.0,0.0,United States,Columbus,NaT
1,E02003,Robert Patel,Analyst,Sales,Corporate,Male,Asian,58,2013-10-23,45703.0,0.0,United States,Chicago,NaT
2,E02004,Cameron Lo,Network Administrator,IT,Research & Development,Male,Asian,34,2019-03-24,83576.0,0.0,China,Shanghai,NaT
3,E02005,Harper Castillo,IT Systems Architect,IT,Corporate,Female,Latino,39,2018-04-07,98062.0,0.0,United States,Seattle,NaT
4,E02006,Harper Dominguez,Director,Engineering,Corporate,Female,Latino,42,2005-06-18,175391.0,24.0,United States,Austin,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1257,E02250,Mila Han,Manager,Sales,Manufacturing,Female,Asian,54,2009-11-14,128791.0,6.0,United States,Miami,NaT
1258,E02251,Genesis Herrera,Manager,IT,Research & Development,Female,Latino,34,2015-10-03,126898.0,10.0,Brazil,Manaus,NaT
1259,E02252,Olivia Vazquez,Network Engineer,IT,Specialty Products,Female,Latino,53,2020-04-13,93053.0,0.0,Brazil,Sao Paulo,NaT
1260,E02253,Leilani Ng,Systems Analyst,IT,Corporate,Female,Asian,48,2011-09-19,50513.0,0.0,United States,Seattle,2019-10-30


In [113]:
# Rows that have a later row with the same Employee ID: keep='last' leaves only the
# final occurrence of each ID unflagged.
has_later_copy = df.duplicated(subset='Employee ID', keep='last')
df[has_later_copy]

Unnamed: 0,Employee ID,Full Name,Job Title,Department,Business Unit,Gender,Ethnicity,Age,Hire Date,Annual Salary,Bonus,Country,City,Exit Date
0,E02002,Kai Le,Controls Engineer,Engineering,Manufacturing,Male,Asian,47,2022-02-05,92368.0,0.0,United States,Columbus,NaT
1,E02003,Robert Patel,Analyst,Sales,Corporate,Male,Asian,58,2013-10-23,45703.0,0.0,United States,Chicago,NaT
2,E02004,Cameron Lo,Network Administrator,IT,Research & Development,Male,Asian,34,2019-03-24,83576.0,0.0,China,Shanghai,NaT
3,E02005,Harper Castillo,IT Systems Architect,IT,Corporate,Female,Latino,39,2018-04-07,98062.0,0.0,United States,Seattle,NaT
4,E02006,Harper Dominguez,Director,Engineering,Corporate,Female,Latino,42,2005-06-18,175391.0,24.0,United States,Austin,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1193,E02022,Eli Dang,Sr. Manager,Accounting,Specialty Products,Male,Asian,45,2015-11-16,122875.0,12.0,,,NaT
1194,E02023,Lillian Lewis,Technical Architect,IT,Research & Development,Female,Black,43,2013-08-14,83323.0,0.0,,,NaT
1195,E02024,Serenity Cao,Account Representative,Sales,Manufacturing,Female,Asian,31,2018-10-21,66721.0,0.0,,,NaT
1196,E02025,Parker Lai,Vice President,Accounting,Specialty Products,Male,Asian,48,2006-11-29,246400.0,36.0,,,NaT


In [118]:
# Keep only the first row for each Employee ID, modifying df in place.
# Missing IDs are treated as equal to one another, so a single row with a missing
# Employee ID survives this step.
df.drop_duplicates(subset='Employee ID', keep='first', inplace=True)

In [123]:
# Re-count missing Employee IDs after de-duplication.
df['Employee ID'].isnull().sum()

1

In [131]:
# Remove the rows that have no Employee ID.
# dropna returns a new frame, so the result must be assigned back; otherwise the row
# with the missing ID stays in df. The trailing expression keeps the frame displayed.
df = df.dropna(subset=['Employee ID'])
df

Unnamed: 0,Employee ID,Full Name,Job Title,Department,Business Unit,Gender,Ethnicity,Age,Hire Date,Annual Salary,Bonus,Country,City,Exit Date
0,E02002,Kai Le,Controls Engineer,Engineering,Manufacturing,Male,Asian,47,2022-02-05,92368.0,0.0,United States,Columbus,NaT
1,E02003,Robert Patel,Analyst,Sales,Corporate,Male,Asian,58,2013-10-23,45703.0,0.0,United States,Chicago,NaT
2,E02004,Cameron Lo,Network Administrator,IT,Research & Development,Male,Asian,34,2019-03-24,83576.0,0.0,China,Shanghai,NaT
3,E02005,Harper Castillo,IT Systems Architect,IT,Corporate,Female,Latino,39,2018-04-07,98062.0,0.0,United States,Seattle,NaT
4,E02006,Harper Dominguez,Director,Engineering,Corporate,Female,Latino,42,2005-06-18,175391.0,24.0,United States,Austin,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
997,E02999,Layla Wu,,Sales,Research & Development,Female,Asian,31,2018-08-27,,0.0,United States,,NaT
998,E03000,Thomas Lam,,Engineering,Manufacturing,Male,Asian,52,2022-11-18,,,United States,,NaT
999,E03001,Willow Taylor,,Sales,Corporate,Female,Black,27,2021-02-08,51321.0,,,Seattle,NaT
1016,E02960,Lucas Sandoval,Sr. Analyst,Sales,Manufacturing,Male,Latino,41,2011-05-19,84627.0,0.0,Brazil,Manaus,NaT


In [135]:
# Sanity check: the number of repeated Employee IDs should now be 0.
df['Employee ID'].duplicated().sum()

0