## 1. Import the attached Netflix CSV file into a Jupyter notebook and perform the following operations using Pandas:

In [None]:
import pandas as pd
# Load the Netflix catalogue from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="netflix.csv")

a) Print the first 5 rows and last 5 rows of the dataframe

In [None]:
# head() returns the first 5 rows by default.
df.head()

In [None]:
# tail() returns the last 5 rows by default.
df.tail()

b)	Check how many rows and columns are there using Pandas function.

In [None]:
# DataFrame.shape is a (rows, columns) tuple; unpack it so each number is named.
row_count, column_count = df.shape
(row_count, column_count)

c)	Print all the column names.

In [None]:
# The column labels of the frame, shown as a pandas Index.
column_names = df.columns
column_names

d) Calculate the descriptive statistics of all the variables (integer, float, object, etc.).

In [None]:
# include="all" summarises every column, not only the numeric ones.
summary_stats = df.describe(include="all")
summary_stats

e)	Check the number of unique values for each column.

In [None]:
# Distinct non-missing values per column (defaults spelled out).
df.nunique(axis=0, dropna=True)

f)	Check the percentage of missing values for each column.

In [None]:
# The mean of a boolean "is missing" mask is the missing fraction; scale it to a percentage.
df.isna().mean() * 100

g)	Delete all the rows where Director column has missing values.

In [None]:
# Remove, in place, every row whose "director" value is missing.
df.dropna(axis="index", how="any", subset=["director"], inplace=True)

h)	Print all the records where country has Germany value (including West Germany). If any other country is there along with Germany, then that row should also come in output.

In [None]:
# Case-insensitive substring match, so "West Germany" and multi-country values
# such as "Germany, France" are included; missing countries never match.
germany_mask = df["country"].str.contains("germany", case=False, na=False)
df.loc[germany_mask]

i)	Expand Duration column into 2 separate columns – First column having the numeric value and other having String. Eg: 3 seasons should be split in 2 columns having 3 in 1st column and seasons in 2nd column.

In [None]:
# Split "duration" (e.g. "3 Seasons") on the first space into a count and a unit.
# reindex guarantees both pieces exist even if no value contains a space.
duration_parts = (
    df["duration"]
    .str.split(" ", n=1, expand=True)
    .reindex(columns=[0, 1])
)
# Store the count as a real number rather than text so it can be sorted and
# aggregated; anything that is not a number becomes NaN instead of raising.
df["duration_num"] = pd.to_numeric(duration_parts[0], errors="coerce")
df["duration_text"] = duration_parts[1]

j)	Split Date added into 3 separate columns having date value in 1st column, month value in 2nd column and year value in 3rd.

In [None]:
# Parse "date_added" and break it into day / month / year columns.
# The text is stripped first: pandas >= 2.0 infers one format from the first
# value and applies it strictly, so a value with stray leading/trailing
# whitespace would be silently coerced to NaT.
# NOTE(review): assumes such padded values can occur in this CSV -- confirm.
# astype(str) keeps the cell re-runnable once the column is already datetime;
# missing values become the text "nan"/"NaT", which still coerces to NaT.
date_added_text = df["date_added"].astype(str).str.strip()
df["date_added"] = pd.to_datetime(date_added_text, errors="coerce")

date_parts = df["date_added"].dt
df["date_day"] = date_parts.day
df["date_month"] = date_parts.month
df["date_year"] = date_parts.year

k)	Print the number of TV shows/Movies released in each year.

In [None]:
# Number of titles for every (release_year, type) pair, as a flat table
# with the tally in a "count" column.
titles_per_year = df.groupby(["release_year", "type"]).size()
titles_per_year.rename("count").reset_index()

l)	Rename the column title with movie_title.

In [None]:
# Rename the "title" column to "movie_title" on the existing frame.
df.rename({"title": "movie_title"}, axis="columns", inplace=True)

m)	Split Listed_in column into 3 different columns with col name (Genre1, Genre2, Genre3). Split the column based on comma.

In [None]:
# Split "listed_in" on commas into exactly three genre columns.
genre_columns = ["Genre1", "Genre2", "Genre3"]

# n caps the number of splits, so a row with more than three genres keeps the
# remainder in the last column instead of producing a fourth column (which
# would make the assignment below fail).
genre_parts = df["listed_in"].str.split(",", n=len(genre_columns) - 1, expand=True)
# Values are separated by ", ", so every piece after the first starts with a
# space; strip it so that "Dramas" and " Dramas" are the same genre.
genre_parts = genre_parts.apply(lambda piece: piece.str.strip())
# Pad with empty columns when no row has three genres, so there are always
# three columns to assign.
genre_parts = genre_parts.reindex(columns=range(len(genre_columns)))
genre_parts.columns = genre_columns

df[genre_columns] = genre_parts

## 2. Import both the attached files (student.csv and mark.csv) into a Jupyter notebook and perform the following operations:

In [None]:
import pandas as pd
# Load the student details and the marks tables from the working directory.
df_students, df_marks = (
    pd.read_csv(csv_name) for csv_name in ("student.csv", "mark.csv")
)

a)	Combine both the dataframes into single dataframe which will have all the records from both the tables.

In [None]:
# An outer join on Student_id keeps every record from both tables; rows
# without a partner in the other table get missing values in its columns.
df = df_students.merge(df_marks, how="outer", on="Student_id")
df

b)	Print the maximum and minimum marks Gender wise.

In [None]:
# Lowest and highest mark per gender.
# The aggregations are named as strings: passing the Python builtins
# min/max to agg() is deprecated and emits a FutureWarning on pandas >= 2.1.
df.groupby(["Gender"])[["Mark"]].agg(["min", "max"])

c)	Print all the students IDs and their marks who have scored more than the average marks of the class.

In [None]:
# IDs and marks of the students who scored above the class mean
# (rows with a missing mark never satisfy the comparison).
class_average = df["Mark"].mean()
df.loc[df["Mark"] > class_average, ["Student_id", "Mark"]]

d) Print the rows of the dataframe for students who are male and employed.

In [None]:
# Male students who ARE employed. The filter previously selected "no",
# i.e. the unemployed males, which is the opposite of what is asked.
# NOTE(review): assumes the positive label is the lowercase "yes", mirroring
# the "no" label used before -- confirm against student.csv.
is_male = df["Gender"] == "Male"
is_employed = df["Employed"] == "yes"
df[is_male & is_employed]

e) Create a new column 'IQ_level' which will have 3 values (Intelligent, Mediocre, Weak). If a student scored more than 80, tag them as Intelligent; if the student scored between 50 and 80, tag them as Mediocre; otherwise tag them as Weak.

In [None]:
def IQ_level(score):
    """Classify a mark into an IQ band.

    Parameters
    ----------
    score : float or int
        The student's mark. May be missing (NaN/None).

    Returns
    -------
    str or None
        "Intelligent" for a mark above 80, "Mediocre" for a mark from 50 to 80
        (both ends included), "Weak" for anything lower, and None when the
        mark is missing.
    """
    # Every comparison with NaN is False, so without this guard a student
    # with no recorded mark would fall through to "Weak".
    if pd.isna(score):
        return None
    if score > 80:
        return "Intelligent"
    # Inclusive bounds: exactly 50 and exactly 80 both count as "between 50-80".
    if 50 <= score <= 80:
        return "Mediocre"
    return "Weak"

# Label every student by passing each mark through IQ_level.
df["IQ_level"] = df["Mark"].map(IQ_level)

f)	Count the number of males and females from each city.

In [None]:
# Non-missing student IDs per (City, Gender) pair.
ids_by_city_gender = df.groupby(by=["City", "Gender"])["Student_id"]
ids_by_city_gender.count()

g)	Print the top 5 Male scorers.

In [None]:
# Restrict to male students, then keep the five rows with the highest marks.
male_students = df.loc[df["Gender"].eq("Male")]
male_students.nlargest(n=5, columns="Mark")

h)	Replace the Male value with M and Female value with F and export this dataframe to excel file in D: (D drive) and name the file as test.csv.

In [None]:
# replace() rewrites only the listed labels. map() would turn every other
# value into NaN -- including the "M"/"F" labels themselves if this cell is
# run a second time -- silently wiping the column.
df["Gender"] = df["Gender"].replace({"Male": "M", "Female": "F"})
# Write the recoded table without the row index.
df.to_csv(r"D:\test.csv", index=False)

i)	Check if any student_ID is duplicated.

In [None]:
# True when at least one Student_id repeats an earlier one.
repeated_id_flags = df["Student_id"].duplicated(keep="first")
repeated_id_flags.any()

j)	Create a separate dataframe which will have all the Integer/Float variables.

In [None]:
# "number" selects every numeric dtype (integers and floats of any width),
# not only the 64-bit ones, so no integer/float column is left out.
df_numeric = df.select_dtypes(include="number")
df_numeric

k)	Get those Student_IDs which are present in Students table but not in Marks table.

In [None]:
# indicator=True adds a "_merge" column recording where each row came from.
# "left_only" means the Student_id exists in the students table but has no
# row at all in the marks table. Testing Mark for null instead would also
# flag students who do have a marks row whose Mark happens to be missing.
new_df = pd.merge(df_students, df_marks, on="Student_id", how="left", indicator=True)
# Only the IDs are requested, so return that column rather than whole rows.
new_df.loc[new_df["_merge"] == "left_only", "Student_id"]

## 3. Explain the concept of missing values. How can you identify the missing values in a Pandas DataFrame? What are the different ways of treating/imputing/deleting the missing values? Explain with an example.


In [None]:
# ---------------------------------------------
# Missing Values in Pandas – Complete Example
# ---------------------------------------------

import pandas as pd
import numpy as np

# ---------------------------
# 1. Create Sample DataFrame
# ---------------------------
# Small frame with gaps in two text columns (np.nan and None) and in a
# numeric column (np.nan); pandas reports both markers as missing.
df = pd.DataFrame({
    'Student_id': [1, 2, 3, 4, 5],
    'Name': ['Jay', 'Eric', 'Amit', np.nan, 'Saranya'],
    'Marks': [85, np.nan, 75, 90, np.nan],
    'City': ['Delhi', 'Mumbai', None, 'Pune', 'Hyderabad']
})

print("Original DataFrame:\n", df, "\n")

# ------------------------------------------
# 2. Identify Missing Values
# ------------------------------------------
print("Check missing values using isnull():\n", df.isnull(), "\n")
print("Count of missing values in each column:\n", df.isnull().sum(), "\n")
print("Non-null info summary:\n")
df.info()
print("\n")

# ------------------------------------------
# 3. Handle Missing Values - Deletion
# ------------------------------------------

# Drop rows with any missing values
df_drop_rows = df.dropna()
print("After dropping rows with missing values:\n", df_drop_rows, "\n")

# Drop columns with missing values
df_drop_cols = df.dropna(axis=1)
print("After dropping columns with missing values:\n", df_drop_cols, "\n")

# Drop rows only if all values are missing
df_drop_all = df.dropna(how='all')
print("After dropping rows where all values are missing:\n", df_drop_all, "\n")

# ------------------------------------------
# 4. Handle Missing Values - Imputation
# ------------------------------------------
# Each example works on its own copy so the original df stays untouched.
# The filled column is assigned back explicitly: calling
# df[col].fillna(..., inplace=True) is chained assignment, which warns on
# recent pandas and does not update the frame at all under Copy-on-Write.

# Fill with constant value
df_const = df.copy()
df_const['City'] = df_const['City'].fillna('Unknown')
print("After filling City with constant value:\n", df_const, "\n")

# Fill with mean, median, and mode
df_mean = df.copy()
df_mean['Marks'] = df_mean['Marks'].fillna(df_mean['Marks'].mean())
print("After filling Marks with mean:\n", df_mean, "\n")

df_median = df.copy()
df_median['Marks'] = df_median['Marks'].fillna(df_median['Marks'].median())
print("After filling Marks with median:\n", df_median, "\n")

df_mode = df.copy()
# mode() returns a Series (there can be ties); [0] takes its first entry.
df_mode['City'] = df_mode['City'].fillna(df_mode['City'].mode()[0])
print("After filling City with mode:\n", df_mode, "\n")

# Forward Fill (previous value)
# ffill()/bfill() replace fillna(method=...), which is deprecated.
df_ffill = df.ffill()
print("After forward fill:\n", df_ffill, "\n")

# Backward Fill (next value)
df_bfill = df.bfill()
print("After backward fill:\n", df_bfill, "\n")

# ------------------------------------------
# 5. Summary of Methods
# ------------------------------------------
"""
Identification:
    df.isnull(), df.notnull(), df.isnull().sum(), df.info()

Deletion:
    df.dropna(), df.dropna(axis=1), df.dropna(how='all'), df.dropna(subset=['col'])

Imputation:
    df.fillna(value)
    df['col'] = df['col'].fillna(df['col'].mean()/median()/mode()[0])
    df.ffill() / df.bfill()
"""

print(" All missing value handling methods demonstrated successfully.")