In [1]:
import pandas as pd

data = {
    'Name':['Alice','Bob','Charlie','David'],
    'Age':[25,30,35,40],
    'Salary':[50000,60000,70000,80000],
    'Gender':['Female','Male','Male','Male']
}

df = pd.DataFrame(data)
df.head()

Unnamed: 0,Name,Age,Salary,Gender
0,Alice,25,50000,Female
1,Bob,30,60000,Male
2,Charlie,35,70000,Male
3,David,40,80000,Male


In [2]:
df.dtypes

Unnamed: 0,0
Name,object
Age,int64
Salary,int64
Gender,object


In [4]:
#Data Extraction
df['Age']
df.iloc[0:2]

Unnamed: 0,Name,Age,Salary,Gender
0,Alice,25,50000,Female
1,Bob,30,60000,Male


Data Wrangling - The process of transforming the messy data into usable format

1. data Acquisition - Loading data from various source

In [5]:
#csv , excel , API , databases
#df = pd.read_csv('')
#url = 'https://vcgdvgcvdgvcgdv.csv'

2. Data Inspection - Inspecting the data to give some insights into the structure and completeness

In [9]:
df.head()
df.describe()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    4 non-null      object
 1   Age     4 non-null      int64 
 2   Salary  4 non-null      int64 
 3   Gender  4 non-null      object
dtypes: int64(2), object(2)
memory usage: 260.0+ bytes


3. Data Cleaning - Removing or correcting the missing values or duplicate values

In [11]:
#identify any null values
df.isnull().sum()

#This is to drop the duplicates
df_cleaned = df.drop_duplicates()
df_cleaned

Unnamed: 0,Name,Age,Salary,Gender
0,Alice,25,50000,Female
1,Bob,30,60000,Male
2,Charlie,35,70000,Male
3,David,40,80000,Male


4. Data Transformation - Normalization, scaling, encoding and type conversion

In [13]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df['Age_Scaled'] = scaler.fit_transform(df[['Age']])
df

Unnamed: 0,Name,Age,Salary,Gender,Age_Scaled
0,Alice,25,50000,Female,0.0
1,Bob,30,60000,Male,0.333333
2,Charlie,35,70000,Male,0.666667
3,David,40,80000,Male,1.0


5. Data Integration - combining data from different source

In [15]:
extra = pd.DataFrame({
    'Name':['Alice','Bob'],
    'Department': ['HR','Finance']
})

df_merged = pd.merge(df , extra , on='Name', how='left')
df_merged

Unnamed: 0,Name,Age,Salary,Gender,Age_Scaled,Department
0,Alice,25,50000,Female,0.0,HR
1,Bob,30,60000,Male,0.333333,Finance
2,Charlie,35,70000,Male,0.666667,
3,David,40,80000,Male,1.0,


6. Data Validation and Quality --> data is consistent, correct

In [17]:
df[df['Age'] < 0]

Unnamed: 0,Name,Age,Salary,Gender,Age_Scaled


7. Documentation and reporting - Describe your dataset, transformations and results clearly

In [18]:
report = df.describe(include='all')
report

Unnamed: 0,Name,Age,Salary,Gender,Age_Scaled
count,4,4.0,4.0,4,4.0
unique,4,,,2,
top,Alice,,,Male,
freq,1,,,3,
mean,,32.5,65000.0,,0.5
std,,6.454972,12909.944487,,0.430331
min,,25.0,50000.0,,0.0
25%,,28.75,57500.0,,0.25
50%,,32.5,65000.0,,0.5
75%,,36.25,72500.0,,0.75
