**Import required libraries**

In [1]:
import pandas as pd
import numpy as np

**Connect Drive to Google.colab**

In [2]:
from google.colab import drive
drive.mount('/drive')

Mounted at /drive


**Import data (.csv form)**

In [3]:
path = "/drive/MyDrive/Data_Processing_lab/Dataset/4_movie_scores.csv"

In [4]:
df=pd.read_csv(path)

**Dataframe**

In [5]:
df

Unnamed: 0,first_name,last_name,age,Gender,pre_movie_score,post_movie_score
0,Root,Joss,36.0,m,8.0,9.0
1,,,,,,
2,Stark,Mike,48.0,m,,
3,Sofie,Miller,39.0,f,7.0,8.0
4,Emma,Roy,84.0,f,6.0,8.0


**Checking and Selecting Null Values**

In [6]:
df.isnull()                                       # null position is indicated by boolean operator "True"

Unnamed: 0,first_name,last_name,age,Gender,pre_movie_score,post_movie_score
0,False,False,False,False,False,False
1,True,True,True,True,True,True
2,False,False,False,False,True,True
3,False,False,False,False,False,False
4,False,False,False,False,False,False


In [7]:
df.notnull()                                         # Non-null position is indicated by boolean operator "True"

Unnamed: 0,first_name,last_name,age,Gender,pre_movie_score,post_movie_score
0,True,True,True,True,True,True
1,False,False,False,False,False,False
2,True,True,True,True,False,False
3,True,True,True,True,True,True
4,True,True,True,True,True,True


**non-null values from perticular raw or column**

In [8]:
df['first_name']

0     Root
1      NaN
2    Stark
3    Sofie
4     Emma
Name: first_name, dtype: object

In [9]:
df[df['first_name'].notnull()]

Unnamed: 0,first_name,last_name,age,Gender,pre_movie_score,post_movie_score
0,Root,Joss,36.0,m,8.0,9.0
2,Stark,Mike,48.0,m,,
3,Sofie,Miller,39.0,f,7.0,8.0
4,Emma,Roy,84.0,f,6.0,8.0


**non-null and null values from multiple raw or column**

In [None]:
df[(df['pre_movie_score'].isnull()) & df['Gender'].notnull()]

Unnamed: 0,first_name,last_name,age,Gender,pre_movie_score,post_movie_score
2,Stark,Mike,48.0,m,,


**How to drop or replace null values?**

**Actual Dataset**

In [10]:
df

Unnamed: 0,first_name,last_name,age,Gender,pre_movie_score,post_movie_score
0,Root,Joss,36.0,m,8.0,9.0
1,,,,,,
2,Stark,Mike,48.0,m,,
3,Sofie,Miller,39.0,f,7.0,8.0
4,Emma,Roy,84.0,f,6.0,8.0


In [None]:
help(df.dropna)

Help on method dropna in module pandas.core.frame:

dropna(axis: 'Axis' = 0, how: 'str' = 'any', thresh=None, subset=None, inplace: 'bool' = False) method of pandas.core.frame.DataFrame instance
    Remove missing values.
    
    See the :ref:`User Guide <missing_data>` for more on which values are
    considered missing, and how to work with missing data.
    
    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Determine if rows or columns which contain missing values are
        removed.
    
        * 0, or 'index' : Drop rows which contain missing values.
        * 1, or 'columns' : Drop columns which contain missing value.
    
        .. versionchanged:: 1.0.0
    
           Pass tuple or list to drop on multiple axes.
           Only a single axis is allowed.
    
    how : {'any', 'all'}, default 'any'
        Determine if row or column is removed from DataFrame, when we have
        at least one NA or all NA.
    
        * 'any' : If a

**count of missing values**

**Missing value count along Columns**

In [11]:
df.isna().sum()

first_name          1
last_name           1
age                 1
Gender              1
pre_movie_score     2
post_movie_score    2
dtype: int64

**Missing value count of complete dataframe** 

In [12]:
df.isna().sum().sum()

8

**Drop rows contains null values**

In [14]:
df.dropna(axis=1)

0
1
2
3
4


**keep rows contains at least one non-null value**

In [15]:
df.dropna(thresh=1)

Unnamed: 0,first_name,last_name,age,Gender,pre_movie_score,post_movie_score
0,Root,Joss,36.0,m,8.0,9.0
2,Stark,Mike,48.0,m,,
3,Sofie,Miller,39.0,f,7.0,8.0
4,Emma,Roy,84.0,f,6.0,8.0


**Drop columns contains null values**

In [None]:
df.dropna(axis=1)

0
1
2
3
4


**keep columns contains at least four non-null values**

In [16]:
df.dropna(thresh=4,axis=1)

Unnamed: 0,first_name,last_name,age,Gender
0,Root,Joss,36.0,m
1,,,,
2,Stark,Mike,48.0,m
3,Sofie,Miller,39.0,f
4,Emma,Roy,84.0,f


**Fill null value Data**

In [17]:
df.fillna("NEW")

Unnamed: 0,first_name,last_name,age,Gender,pre_movie_score,post_movie_score
0,Root,Joss,36.0,m,8.0,9.0
1,NEW,NEW,NEW,NEW,NEW,NEW
2,Stark,Mike,48.0,m,NEW,NEW
3,Sofie,Miller,39.0,f,7.0,8.0
4,Emma,Roy,84.0,f,6.0,8.0


In [24]:
df['age'].fillna(5)

0    36.0
1     5.0
2    48.0
3    39.0
4    84.0
Name: age, dtype: float64

In [19]:
df['first_name'] = df['first_name'].fillna("Empty")

In [25]:
df

Unnamed: 0,first_name,last_name,age,Gender,pre_movie_score,post_movie_score
0,Root,Joss,36.0,m,8.0,9.0
1,Empty,,,,,
2,Stark,Mike,48.0,m,,
3,Sofie,Miller,39.0,f,7.0,8.0
4,Emma,Roy,84.0,f,6.0,8.0


**Fill 'pre_movie_score' with mean value of 'pre_movie_score'**

**Find mean value of 'pre_movie_score' column**

In [26]:
df['pre_movie_score'].mean()

7.0

In [27]:
df['pre_movie_score'].fillna(df['pre_movie_score'].mean())

0    8.0
1    7.0
2    7.0
3    7.0
4    6.0
Name: pre_movie_score, dtype: float64

**Replace missing values with mean and median**

In [None]:
path2= "/drive/MyDrive/Data_Processing_lab/Dataset/Students_expenses.csv"

In [None]:
dz=pd.read_csv(path2)
dz

Unnamed: 0,Expenses,Riya,Samual,Neeti,Shreya
0,January,8100,6200.0,9200.0,8150.0
1,February,9500,7500.0,,7200.0
2,March,7300,6100.0,8800.0,8100.0
3,April,7800,7200.0,8900.0,7500.0
4,May,8500,,9100.0,7800.0
5,June,9200,7100.0,8850.0,


In [None]:
dz.mean()

  """Entry point for launching an IPython kernel.


Riya      8400.0
Samual    6820.0
Neeti     8970.0
Shreya    7750.0
dtype: float64

In [None]:
dz.fillna(dz.mean())

  """Entry point for launching an IPython kernel.


Unnamed: 0,Expenses,Riya,Samual,Neeti,Shreya
0,January,8100,6200.0,9200.0,8150.0
1,February,9500,7500.0,8970.0,7200.0
2,March,7300,6100.0,8800.0,8100.0
3,April,7800,7200.0,8900.0,7500.0
4,May,8500,6820.0,9100.0,7800.0
5,June,9200,7100.0,8850.0,7750.0


In [None]:
dz

Unnamed: 0,Expenses,Riya,Samual,Neeti,Shreya
0,January,8100,6200.0,9200.0,8150.0
1,February,9500,7500.0,,7200.0
2,March,7300,6100.0,8800.0,8100.0
3,April,7800,7200.0,8900.0,7500.0
4,May,8500,,9100.0,7800.0
5,June,9200,7100.0,8850.0,


In [None]:
dz.median()

  """Entry point for launching an IPython kernel.


Riya      8300.0
Samual    7100.0
Neeti     8900.0
Shreya    7800.0
dtype: float64

In [None]:
dz.fillna(dz.median())

  """Entry point for launching an IPython kernel.


Unnamed: 0,Expenses,Riya,Samual,Neeti,Shreya
0,January,8100,6200.0,9200.0,8150.0
1,February,9500,7500.0,8900.0,7200.0
2,March,7300,6100.0,8800.0,8100.0
3,April,7800,7200.0,8900.0,7500.0
4,May,8500,7100.0,9100.0,7800.0
5,June,9200,7100.0,8850.0,7800.0
