![image.png](attachment:image.png)

![image.png](attachment:image.png)

![image.png](attachment:image.png)

In [1]:
# Pandas is a data manipulation and analysis tool that uses a data structure known as DataFrame. 
# DataFrames empower programmers to store and manipulate data in a tabular fashion (rows & columns).
# Import Pandas Library into the current environment 
# "pd" is an alias of "Pandas" 
import pandas as pd 

In [2]:
# Load the investors CSV file (path is relative to the working directory) into a DataFrame
investor_df = pd.read_csv(filepath_or_buffer='investors_data.csv')
# Leaving the DataFrame as the last expression renders it as the cell output
investor_df

Unnamed: 0,First Name,Last Name,Age,Portfolio Size,Years with Investment Firm,Risk Tolerance,Goal
0,Ryan,David,32,80100.0,5.0,aggressive,buy house
1,Sherif,George,54,950000.0,30.0,conservative,retire
2,Sandra,Stevenson,40,150509.0,10.0,moderate,kids education
3,Victoria,Keller,43,300901.0,,moderate,investment property
4,Sarah,Aly,26,41258.0,2.0,aggressive,pay student loans
5,Bassel,Nasr,50,401201.0,15.0,conservative,retire
6,Chris,Peter,38,,8.0,moderate,kids education
7,Nancy,Smith,55,900000.0,17.0,conservative,retire
8,Heidi,Smith,23,1500.0,1.0,moderate,retire early


In [4]:
# Build a boolean mask with the same shape as investor_df:
# True marks a missing (Null/NaN) entry, False marks a present value
investor_df.isna()

Unnamed: 0,First Name,Last Name,Age,Portfolio Size,Years with Investment Firm,Risk Tolerance,Goal
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False
4,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False
6,False,False,False,True,False,False,False
7,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False


In [5]:
# Summing the boolean mask column by column counts the missing entries in each column
investor_df.isna().sum()

First Name                    0
Last Name                     0
Age                           0
Portfolio Size                1
Years with Investment Firm    1
Risk Tolerance                0
Goal                          0
dtype: int64

In [6]:
# Remove every row that holds at least one Null value.
# inplace=True modifies investor_df itself, so the DataFrame ends up with fewer rows
investor_df.dropna(axis=0, how='any', inplace=True)

In [7]:
# Rows 3 and 6 (the two rows that contained a missing value) no longer exist.
# The remaining rows keep their original index labels, so the index now
# skips 3 and 6 instead of being renumbered.
investor_df

Unnamed: 0,First Name,Last Name,Age,Portfolio Size,Years with Investment Firm,Risk Tolerance,Goal
0,Ryan,David,32,80100.0,5.0,aggressive,buy house
1,Sherif,George,54,950000.0,30.0,conservative,retire
2,Sandra,Stevenson,40,150509.0,10.0,moderate,kids education
4,Sarah,Aly,26,41258.0,2.0,aggressive,pay student loans
5,Bassel,Nasr,50,401201.0,15.0,conservative,retire
7,Nancy,Smith,55,900000.0,17.0,conservative,retire
8,Heidi,Smith,23,1500.0,1.0,moderate,retire early


In [8]:
# Confirm that no missing values remain: every per-column count should now be zero
investor_df.isna().sum()

First Name                    0
Last Name                     0
Age                           0
Portfolio Size                0
Years with Investment Firm    0
Risk Tolerance                0
Goal                          0
dtype: int64

In [9]:
# An alternative to dropping rows is to fill in the missing values instead.
# Reload the raw data first, because the rows with Nulls were dropped in place above
investor_df = pd.read_csv(filepath_or_buffer='investors_data.csv')
investor_df

Unnamed: 0,First Name,Last Name,Age,Portfolio Size,Years with Investment Firm,Risk Tolerance,Goal
0,Ryan,David,32,80100.0,5.0,aggressive,buy house
1,Sherif,George,54,950000.0,30.0,conservative,retire
2,Sandra,Stevenson,40,150509.0,10.0,moderate,kids education
3,Victoria,Keller,43,300901.0,,moderate,investment property
4,Sarah,Aly,26,41258.0,2.0,aggressive,pay student loans
5,Bassel,Nasr,50,401201.0,15.0,conservative,retire
6,Chris,Peter,38,,8.0,moderate,kids education
7,Nancy,Smith,55,900000.0,17.0,conservative,retire
8,Heidi,Smith,23,1500.0,1.0,moderate,retire early


In [10]:
# Let's obtain a statistical summary using "describe()" method
# Only the numeric columns are summarized (count, mean, std, min, quartiles, max).
# "count" is the number of non-missing entries, which is why it is lower for
# the two columns that contain a Null value.
investor_df.describe()

Unnamed: 0,Age,Portfolio Size,Years with Investment Firm
count,9.0,8.0,8.0
mean,40.111111,353183.625,11.0
std,11.634479,377372.061973,9.561829
min,23.0,1500.0,1.0
25%,32.0,70389.5,4.25
50%,40.0,225705.0,9.0
75%,50.0,525900.75,15.5
max,55.0,950000.0,30.0


In [12]:
# Average portfolio size; the missing entry is skipped, so this matches the
# "mean" reported by describe() for this column
portfolio_sizes = investor_df['Portfolio Size']
portfolio_sizes.mean()

353183.625

In [13]:
# Use .fillna() to fill the missing "Portfolio Size" entries with the column mean.
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on investor_df['Portfolio Size']: that form is chained
# assignment on an intermediate Series, which raises a FutureWarning in recent
# pandas 2.x releases and leaves investor_df unchanged under Copy-on-Write (pandas 3.0).
portfolio_mean = investor_df['Portfolio Size'].mean()
investor_df['Portfolio Size'] = investor_df['Portfolio Size'].fillna(portfolio_mean)

In [14]:
# Display the DataFrame to verify the fill: "Portfolio Size" in row 6 should now hold
# the column mean, while "Years with Investment Firm" in row 3 is still missing
investor_df

Unnamed: 0,First Name,Last Name,Age,Portfolio Size,Years with Investment Firm,Risk Tolerance,Goal
0,Ryan,David,32,80100.0,5.0,aggressive,buy house
1,Sherif,George,54,950000.0,30.0,conservative,retire
2,Sandra,Stevenson,40,150509.0,10.0,moderate,kids education
3,Victoria,Keller,43,300901.0,,moderate,investment property
4,Sarah,Aly,26,41258.0,2.0,aggressive,pay student loans
5,Bassel,Nasr,50,401201.0,15.0,conservative,retire
6,Chris,Peter,38,353183.625,8.0,moderate,kids education
7,Nancy,Smith,55,900000.0,17.0,conservative,retire
8,Heidi,Smith,23,1500.0,1.0,moderate,retire early


**PRACTICE OPPORTUNITY:**
- **Complete the following tasks:**
    - **1. Import the 'investors_data.csv' dataset using Pandas**
    - **2. Calculate the median value for the "Years with Investment Firm" column**
    - **3. Fill in missing values in the "Years with Investment Firm" column with the median value**
    - **4. Perform a sanity check to confirm that the column no longer contains missing values**

# PRACTICE OPPORTUNITY SOLUTION

**PRACTICE OPPORTUNITY SOLUTION:**
- **Complete the following tasks:**
    - **1. Import the 'investors_data.csv' dataset using Pandas**
    - **2. Calculate the median value for the "Years with Investment Firm" column**
    - **3. Fill in missing values in the "Years with Investment Firm" column with the median value**
    - **4. Perform a sanity check to confirm that the column no longer contains missing values**

In [15]:
# Start the solution from a fresh copy of the raw data
import pandas as pd
investor_df = pd.read_csv(filepath_or_buffer='investors_data.csv')
# Show the loaded DataFrame as the cell output
investor_df

Unnamed: 0,First Name,Last Name,Age,Portfolio Size,Years with Investment Firm,Risk Tolerance,Goal
0,Ryan,David,32,80100.0,5.0,aggressive,buy house
1,Sherif,George,54,950000.0,30.0,conservative,retire
2,Sandra,Stevenson,40,150509.0,10.0,moderate,kids education
3,Victoria,Keller,43,300901.0,,moderate,investment property
4,Sarah,Aly,26,41258.0,2.0,aggressive,pay student loans
5,Bassel,Nasr,50,401201.0,15.0,conservative,retire
6,Chris,Peter,38,,8.0,moderate,kids education
7,Nancy,Smith,55,900000.0,17.0,conservative,retire
8,Heidi,Smith,23,1500.0,1.0,moderate,retire early


In [16]:
# Count the missing values in each column to see which columns need filling
investor_df.isna().sum()

First Name                    0
Last Name                     0
Age                           0
Portfolio Size                1
Years with Investment Firm    1
Risk Tolerance                0
Goal                          0
dtype: int64

In [18]:
# Median of the "Years with Investment Firm" column, computed over the non-missing entries
years_with_firm = investor_df['Years with Investment Firm']
years_with_firm.median()

9.0

In [19]:
# Use .fillna() to fill the missing "Years with Investment Firm" entries with the column median.
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on investor_df['Years with Investment Firm']: that form is
# chained assignment on an intermediate Series, which raises a FutureWarning in recent
# pandas 2.x releases and leaves investor_df unchanged under Copy-on-Write (pandas 3.0).
years_median = investor_df['Years with Investment Firm'].median()
investor_df['Years with Investment Firm'] = investor_df['Years with Investment Firm'].fillna(years_median)

In [20]:
# Perform a sanity check: the filled column must no longer contain any missing values.
# The assertion makes the check explicit instead of relying on reading the table below.
assert not investor_df['Years with Investment Firm'].isnull().any(), \
    "'Years with Investment Firm' still contains missing values"
# Per-column count of remaining missing values. "Portfolio Size" still reports 1
# because this exercise only fills "Years with Investment Firm".
investor_df.isnull().sum()

First Name                    0
Last Name                     0
Age                           0
Portfolio Size                1
Years with Investment Firm    0
Risk Tolerance                0
Goal                          0
dtype: int64

In [21]:
# Display the result: "Years with Investment Firm" in row 3 should now hold the median,
# while the missing "Portfolio Size" in row 6 is intentionally left untouched
investor_df

Unnamed: 0,First Name,Last Name,Age,Portfolio Size,Years with Investment Firm,Risk Tolerance,Goal
0,Ryan,David,32,80100.0,5.0,aggressive,buy house
1,Sherif,George,54,950000.0,30.0,conservative,retire
2,Sandra,Stevenson,40,150509.0,10.0,moderate,kids education
3,Victoria,Keller,43,300901.0,9.0,moderate,investment property
4,Sarah,Aly,26,41258.0,2.0,aggressive,pay student loans
5,Bassel,Nasr,50,401201.0,15.0,conservative,retire
6,Chris,Peter,38,,8.0,moderate,kids education
7,Nancy,Smith,55,900000.0,17.0,conservative,retire
8,Heidi,Smith,23,1500.0,1.0,moderate,retire early


# EXCELLENT JOB!