#### Data Manipulation and Analysis with Pandas
Data manipulation and analysis are key tasks in any data science or data analysis project. Pandas provides a wide range of functions for data manipulation and analysis, making it easier to clean, transform, and extract insights from data. In this lesson, we will cover various data manipulation and analysis techniques using Pandas.

In [1]:
import pandas as pd
# Load the raw dataset; the path is resolved relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="data.csv")

In [2]:
# Preview the first five rows (head's default) to confirm the columns loaded as expected.
df.head(n=5)

Unnamed: 0,Date,Category,Value,Product,Sales,Region
0,01-01-2023,A,28.0,Product1,754.0,East
1,02-01-2023,B,39.0,Product3,110.0,North
2,03-01-2023,C,32.0,Product2,398.0,East
3,04-01-2023,B,8.0,Product1,522.0,East
4,05-01-2023,B,26.0,Product3,869.0,North


In [3]:
# Look at the last two rows to see where the data ends.
df.tail(n=2)

Unnamed: 0,Date,Category,Value,Product,Sales,Region
48,18-02-2023,C,65.0,Product3,182.0,North
49,19-02-2023,C,11.0,Product3,708.0,North


In [4]:
# Summary statistics. By default describe() reports only the numeric columns;
# transposing gives one row per column so the statistics read across.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Value,47.0,51.744681,29.050532,2.0,27.5,54.0,70.0,99.0
Sales,46.0,557.130435,274.598584,108.0,339.0,591.5,767.5,992.0


In [7]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

Date         object
Category     object
Value       float64
Product      object
Sales       float64
Region       object
dtype: object

In [8]:
# isna() returns a boolean frame with the same shape as df (True where a value is missing).
# Display only the first rows: the full mask is as long as the dataset itself.
df.isna().head(10)

Unnamed: 0,Date,Category,Value,Product,Sales,Region
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
5,False,False,False,False,False,False
6,False,False,False,False,False,False
7,False,False,False,False,False,False
8,False,False,False,False,False,False
9,False,False,False,False,False,False


In [9]:
# Missing values per column: summing the boolean mask down the rows counts the True entries.
df.isna().sum(axis=0)

Date        0
Category    0
Value       3
Product     0
Sales       4
Region      0
dtype: int64

In [10]:
# Replace every missing value with 0. fillna returns a new frame, so df itself is unchanged.
df_filled = df.fillna(value=0)

In [12]:
# Verify the fill: every column of the filled copy should now report zero missing values.
df_filled.isna().sum(axis=0)

Date        0
Category    0
Value       0
Product     0
Sales       0
Region      0
dtype: int64

In [13]:
# Impute missing Sales with the column mean, storing the result in a new
# 'Sale_na' column so the original 'Sales' values (NaNs included) are preserved.
sales_mean = df['Sales'].mean()
df['Sale_na'] = df['Sales'].fillna(sales_mean)

In [14]:
# Re-count missing values per column: the source columns still contain their NaNs,
# while the newly added imputed column has none.
df.isna().sum(axis=0)

Date        0
Category    0
Value       3
Product     0
Sales       4
Region      0
Sale_na     0
dtype: int64

In [20]:
# Rename the 'Date' column to 'Sales Date'. rename returns a new frame, so rebind df.
df = df.rename({'Date': 'Sales Date'}, axis='columns')
df.head(n=2)

Unnamed: 0,Sales Date,Category,Value,Product,Sales,Region,Sale_na
0,01-01-2023,A,28.0,Product1,754.0,East,754.0
1,02-01-2023,B,39.0,Product3,110.0,North,110.0


In [25]:
# Change the data type. The gaps are filled first because a float column that still
# contains NaN cannot be cast to int. The cast drops the fractional part of the
# filled-in mean rather than rounding it.
value_mean = df['Value'].mean()
df['Values_new'] = df['Value'].fillna(value_mean).astype(int)

df.head(n=2)

Unnamed: 0,Sales Date,Category,Value,Product,Sales,Region,Sale_na,Values_new
0,01-01-2023,A,28.0,Product1,754.0,East,754.0,28
1,02-01-2023,B,39.0,Product3,110.0,North,110.0,39


In [26]:
# Double every Value with vectorized arithmetic. This produces the same column as
# .apply(lambda x: x * 2) but without a Python-level function call per row;
# missing values stay NaN either way.
df['New Value'] = df['Value'] * 2
df.head(2)

Unnamed: 0,Sales Date,Category,Value,Product,Sales,Region,Sale_na,Values_new,New Value
0,01-01-2023,A,28.0,Product1,754.0,East,754.0,28,56.0
1,02-01-2023,B,39.0,Product3,110.0,North,110.0,39,78.0


In [29]:
# Data aggregation and grouping: mean Value for each Product.
grouped = df.groupby('Product')['Value'].mean()

# End the cell with the object itself so the notebook displays it; print() is not needed.
grouped

Product
Product1    46.214286
Product2    52.800000
Product3    55.166667
Name: Value, dtype: float64


In [33]:
# Group by two keys: total Value for every (Product, Region) pair.
# The result is a Series indexed by a two-level (Product, Region) MultiIndex.
grouped_sum = df.groupby(['Product', 'Region'])['Value'].sum()
grouped_sum

Product   Region
Product1  East      292.0
          North       9.0
          South     100.0
          West      246.0
Product2  East       56.0
          North     127.0
          South     181.0
          West      428.0
Product3  East      202.0
          North     203.0
          South     215.0
          West      373.0
Name: Value, dtype: float64


In [34]:
# Multiple aggregate functions at once: agg() with a list of function names
# returns a DataFrame with one column per statistic and one row per Region.
grouped_agg = df.groupby('Region')['Value'].agg(['sum', 'mean', 'max', 'min'])

# A bare last expression renders the DataFrame as a table; print() would show plain text only.
grouped_agg

           sum       mean   max   min
Region                               
East     550.0  42.307692  97.0   8.0
North    339.0  37.666667  71.0   2.0
South    496.0  62.000000  94.0   6.0
West    1047.0  61.588235  99.0  17.0


In [35]:
# Merging and joining DataFrames
# Build two small sample frames that share a 'Key' column.
# Only the keys 'B' and 'D' occur in both, so each join type gives a different result.
df1 = pd.DataFrame(
    [('A', 1), ('B', 2), ('C', 3), ('D', 4)],
    columns=['Key', 'Value1'],
)
df2 = pd.DataFrame(
    [('B', 5), ('D', 6), ('E', 7), ('F', 8)],
    columns=['Key', 'Value2'],
)

In [36]:
# Display the first sample frame (keys A-D).
df1

Unnamed: 0,Key,Value1
0,A,1
1,B,2
2,C,3
3,D,4


In [37]:
# Display the second sample frame (keys B, D, E, F).
df2

Unnamed: 0,Key,Value2
0,B,5
1,D,6
2,E,7
3,F,8


In [38]:
# Merge on the 'Key' column.
# Inner join: keep only the keys that appear in both frames.
df1.merge(df2, on='Key', how='inner')

Unnamed: 0,Key,Value1,Value2
0,B,2,5
1,D,4,6


In [39]:
# Outer join: keep the keys from both frames; where one side has no match,
# its value column is filled with NaN.
df1.merge(df2, on='Key', how='outer')

Unnamed: 0,Key,Value1,Value2
0,A,1.0,
1,B,2.0,5.0
2,C,3.0,
3,D,4.0,6.0
4,E,,7.0
5,F,,8.0


In [40]:
# Left join: keep every row of df1; Value2 is NaN where df2 has no matching Key.
df1.merge(df2, on='Key', how='left')

Unnamed: 0,Key,Value1,Value2
0,A,1,
1,B,2,5.0
2,C,3,
3,D,4,6.0


In [42]:
# Right join: keep every row of df2; Value1 is NaN where df1 has no matching Key.
df1.merge(df2, on='Key', how='right')

Unnamed: 0,Key,Value1,Value2
0,B,2.0,5
1,D,4.0,6
2,E,,7
3,F,,8
