# **Drop Data**

In [1]:
import pandas as pd

In [2]:
sales_data = {
    'Date': pd.to_datetime([
        '2023-01-01', '2023-01-01', '2023-01-02', '2023-01-02', '2023-01-03'
    ]),
    'Product': ['Widget A', 'Widget B', 'Widget A', 'Widget B', 'Widget A'],
    'Region': ['North', 'North', 'South', 'South', 'East'],
    'Units_Sold': [100, 120, 80, None, 90],
    'Revenue': [5000.00, 7200.50, 4000.00, 9000.00, 4500.25]
}

df_sales = pd.DataFrame(sales_data)
print(df_sales)

        Date   Product Region  Units_Sold  Revenue
0 2023-01-01  Widget A  North       100.0  5000.00
1 2023-01-01  Widget B  North       120.0  7200.50
2 2023-01-02  Widget A  South        80.0  4000.00
3 2023-01-02  Widget B  South         NaN  9000.00
4 2023-01-03  Widget A   East        90.0  4500.25


### Drop Rows/Columns
Removing specified rows or columns

In [3]:
# Drop Date Column
df_sales = df_sales.drop(columns=['Date'])
print(df_sales)

    Product Region  Units_Sold  Revenue
0  Widget A  North       100.0  5000.00
1  Widget B  North       120.0  7200.50
2  Widget A  South        80.0  4000.00
3  Widget B  South         NaN  9000.00
4  Widget A   East        90.0  4500.25


In [4]:
# Add Date Column At last
df_sales["Date"] = pd.to_datetime([
        '2023-01-01', '2023-01-01', '2023-01-02', '2023-01-02', '2023-01-03'
    ])
print(df_sales)

    Product Region  Units_Sold  Revenue       Date
0  Widget A  North       100.0  5000.00 2023-01-01
1  Widget B  North       120.0  7200.50 2023-01-01
2  Widget A  South        80.0  4000.00 2023-01-02
3  Widget B  South         NaN  9000.00 2023-01-02
4  Widget A   East        90.0  4500.25 2023-01-03


### Drop Rows/Columns
Removing specified rows or columns

In [5]:
# Drop Units_Sold
df_sales = df_sales.drop(columns=['Units_Sold'])
print(df_sales)

    Product Region  Revenue       Date
0  Widget A  North  5000.00 2023-01-01
1  Widget B  North  7200.50 2023-01-01
2  Widget A  South  4000.00 2023-01-02
3  Widget B  South  9000.00 2023-01-02
4  Widget A   East  4500.25 2023-01-03


# **Handle Missing Data**

In [16]:
missing_data = "/content/drive/MyDrive/Pandas for Data Analysis/Day_05/customer_data.csv"
df_customer = pd.read_csv(missing_data)
print(df_customer)

   CustomerID           Name  Age         City  TotalSpend
0         101    Alice Smith   29     New York     1200.50
1         102      Bob Jones  NaN  Los Angeles         NaN
2         103  Charlie Brown   34          NaN      500.00
3         104            NaN   22      Chicago      300.25
4         105   David Wilson   45      Houston      850.00
5         106      Eve Davis  NaN      Phoenix      125.00
6         107   Frank Miller   30     New York         NaN
7         108      Grace Lee  NaN      Seattle     2100.75
8         109    Henry White    ?       Boston         NaN
9         110      Ivy Green   28          NaN      950.50


### **Detect Missing Data**

In [17]:
print(df_customer.isnull())

   CustomerID   Name    Age   City  TotalSpend
0       False  False  False  False       False
1       False  False   True  False        True
2       False  False  False   True       False
3       False   True  False  False       False
4       False  False  False  False       False
5       False  False   True  False       False
6       False  False  False  False        True
7       False  False   True  False       False
8       False  False  False  False        True
9       False  False  False   True       False


### Check Missing Values
Identifying null/missing data

In [18]:
print(df_customer.isnull().sum())

CustomerID    0
Name          1
Age           3
City          2
TotalSpend    3
dtype: int64


### **Drop Missing Values using `dropna()` method**
Not Good Approach

In [19]:
df_customer.dropna(inplace=True)
print(df_customer)

   CustomerID          Name Age      City  TotalSpend
0         101   Alice Smith  29  New York      1200.5
4         105  David Wilson  45   Houston       850.0


### **Fill the missing values**

In [20]:
customer_data = "/content/drive/MyDrive/Pandas for Data Analysis/Day_05/customer_data.csv"
df_Customer = pd.read_csv(customer_data)
print(df_Customer)

   CustomerID           Name  Age         City  TotalSpend
0         101    Alice Smith   29     New York     1200.50
1         102      Bob Jones  NaN  Los Angeles         NaN
2         103  Charlie Brown   34          NaN      500.00
3         104            NaN   22      Chicago      300.25
4         105   David Wilson   45      Houston      850.00
5         106      Eve Davis  NaN      Phoenix      125.00
6         107   Frank Miller   30     New York         NaN
7         108      Grace Lee  NaN      Seattle     2100.75
8         109    Henry White    ?       Boston         NaN
9         110      Ivy Green   28          NaN      950.50


**Fill with default value**

### Fill Missing Values
Handling missing data with fill values

In [21]:
default_value = 0
df_Customer.fillna(default_value, inplace=True)
print(df_Customer)

   CustomerID           Name Age         City  TotalSpend
0         101    Alice Smith  29     New York     1200.50
1         102      Bob Jones   0  Los Angeles        0.00
2         103  Charlie Brown  34            0      500.00
3         104              0  22      Chicago      300.25
4         105   David Wilson  45      Houston      850.00
5         106      Eve Davis   0      Phoenix      125.00
6         107   Frank Miller  30     New York        0.00
7         108      Grace Lee   0      Seattle     2100.75
8         109    Henry White   ?       Boston        0.00
9         110      Ivy Green  28            0      950.50


### Data Info\n
Viewing DataFrame structure and data types

In [24]:
df_Customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   CustomerID  10 non-null     int64  
 1   Name        10 non-null     object 
 2   Age         10 non-null     object 
 3   City        10 non-null     object 
 4   TotalSpend  10 non-null     float64
dtypes: float64(1), int64(1), object(3)
memory usage: 532.0+ bytes


In [27]:
# Convert Age Column DataType object to int
df_Customer['Age'] = df_Customer['Age'].replace('?', 0)
df_Customer['Age'] = df_Customer['Age'].astype(int)
print(df_Customer)

   CustomerID           Name  Age         City  TotalSpend
0         101    Alice Smith   29     New York     1200.50
1         102      Bob Jones    0  Los Angeles        0.00
2         103  Charlie Brown   34            0      500.00
3         104              0   22      Chicago      300.25
4         105   David Wilson   45      Houston      850.00
5         106      Eve Davis    0      Phoenix      125.00
6         107   Frank Miller   30     New York        0.00
7         108      Grace Lee    0      Seattle     2100.75
8         109    Henry White    0       Boston        0.00
9         110      Ivy Green   28            0      950.50


### Data Info\n
Viewing DataFrame structure and data types

In [28]:
df_Customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   CustomerID  10 non-null     int64  
 1   Name        10 non-null     object 
 2   Age         10 non-null     int64  
 3   City        10 non-null     object 
 4   TotalSpend  10 non-null     float64
dtypes: float64(1), int64(2), object(2)
memory usage: 532.0+ bytes


In [37]:
# Fill Values
mean_of_age = df_Customer['Age'].mean()
df_Customer['Age'].replace(0, mean_of_age, inplace=True)

In [38]:
print(df_Customer)

   CustomerID           Name   Age         City  TotalSpend
0         101    Alice Smith  29.0     New York     1200.50
1         102      Bob Jones  18.8  Los Angeles        0.00
2         103  Charlie Brown  34.0            0      500.00
3         104              0  22.0      Chicago      300.25
4         105   David Wilson  45.0      Houston      850.00
5         106      Eve Davis  18.8      Phoenix      125.00
6         107   Frank Miller  30.0     New York        0.00
7         108      Grace Lee  18.8      Seattle     2100.75
8         109    Henry White  18.8       Boston        0.00
9         110      Ivy Green  28.0            0      950.50


Add Name in Name Column

### Access Rows
Selecting rows using loc/iloc

In [42]:
df_Customer.loc[3, "Name"] = "Noman"
print(df_Customer)

   CustomerID           Name   Age         City  TotalSpend
0         101    Alice Smith  29.0     New York     1200.50
1         102      Bob Jones  18.8  Los Angeles        0.00
2         103  Charlie Brown  34.0            0      500.00
3         104          Noman  22.0      Chicago      300.25
4         105   David Wilson  45.0      Houston      850.00
5         106      Eve Davis  18.8      Phoenix      125.00
6         107   Frank Miller  30.0     New York        0.00
7         108      Grace Lee  18.8      Seattle     2100.75
8         109    Henry White  18.8       Boston        0.00
9         110      Ivy Green  28.0            0      950.50


Add City Name

### Access Rows
Selecting rows using loc/iloc

In [43]:
df_Customer.loc[2, "City"] = "Karachi"
df_Customer.loc[9, "City"] = "Lahore"
print(df_Customer)

   CustomerID           Name   Age         City  TotalSpend
0         101    Alice Smith  29.0     New York     1200.50
1         102      Bob Jones  18.8  Los Angeles        0.00
2         103  Charlie Brown  34.0      Karachi      500.00
3         104          Noman  22.0      Chicago      300.25
4         105   David Wilson  45.0      Houston      850.00
5         106      Eve Davis  18.8      Phoenix      125.00
6         107   Frank Miller  30.0     New York        0.00
7         108      Grace Lee  18.8      Seattle     2100.75
8         109    Henry White  18.8       Boston        0.00
9         110      Ivy Green  28.0       Lahore      950.50


Handle `TotalSpend` Column

In [44]:
mean_of_total_spend = df_Customer['TotalSpend'].mean()
df_Customer['TotalSpend'].replace(0, mean_of_total_spend, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_Customer['TotalSpend'].replace(0, mean_of_total_spend, inplace=True)


In [45]:
print(df_Customer)

   CustomerID           Name   Age         City  TotalSpend
0         101    Alice Smith  29.0     New York     1200.50
1         102      Bob Jones  18.8  Los Angeles      602.70
2         103  Charlie Brown  34.0      Karachi      500.00
3         104          Noman  22.0      Chicago      300.25
4         105   David Wilson  45.0      Houston      850.00
5         106      Eve Davis  18.8      Phoenix      125.00
6         107   Frank Miller  30.0     New York      602.70
7         108      Grace Lee  18.8      Seattle     2100.75
8         109    Henry White  18.8       Boston      602.70
9         110      Ivy Green  28.0       Lahore      950.50
