In [1]:
import numpy as np

In [2]:
import pandas as pd

In [5]:
# Load the raw student records; the relative path means students.csv must be
# present in the kernel's working directory.
data = pd.read_csv("students.csv")

In [6]:
# Show a heading followed by the first five student rows.
print("Initial Data:", data.head(), sep="\n")

Initial Data:
  Student_ID       Name  Age  Gender   Location Enrolment_Date
0       S001  Student_1   24  Female    Kolkata     24/11/2023
1       S002  Student_2   32   Other    Chennai     27/02/2023
2       S003  Student_3   28   Other     Mumbai     13/01/2023
3       S004  Student_4   25  Female  Bangalore     21/05/2023
4       S005  Student_5   24   Other      Delhi     06/05/2023


In [7]:
# Report, per column, how many entries are missing (NaN/None/NaT).
print("\nMissing Values:", data.isna().sum(), sep="\n")


Missing Values:
Student_ID        0
Name              0
Age               0
Gender            0
Location          0
Enrolment_Date    0
dtype: int64


In [8]:
# Count rows that repeat an earlier row in every column (the first
# occurrence of each row is not counted as a duplicate).
duplicates = data.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")



Number of duplicate rows: 0


In [9]:
# Parse Enrolment_Date from day/month/year text (e.g. 24/11/2023) into
# datetime values; a value that does not match the format raises an error.
data['Enrolment_Date'] = pd.to_datetime(data['Enrolment_Date'], format='%d/%m/%Y')

In [10]:
# Rich-display the first five rows to confirm the date column was parsed.
data.head(n=5)

Unnamed: 0,Student_ID,Name,Age,Gender,Location,Enrolment_Date
0,S001,Student_1,24,Female,Kolkata,2023-11-24
1,S002,Student_2,32,Other,Chennai,2023-02-27
2,S003,Student_3,28,Other,Mumbai,2023-01-13
3,S004,Student_4,25,Female,Bangalore,2023-05-21
4,S005,Student_5,24,Other,Delhi,2023-05-06


In [11]:
# Force Age to an integer dtype; this raises if the column contains missing
# or non-numeric values.
data['Age'] = data['Age'].astype(int)

In [12]:
# Normalise Gender labels: trim surrounding whitespace, then upper-case the
# first character and lower-case the rest (e.g. " FEMALE " -> "Female").
data['Gender'] = data['Gender'].str.strip().str.capitalize()

In [13]:
# Normalise Location names: trim surrounding whitespace and title-case each
# word. str.capitalize() lower-cases everything after the first character, so
# multi-word places such as "New Delhi" would come out as "New delhi";
# str.title() keeps every word capitalised.
data['Location'] = data['Location'].str.strip().str.title()

In [14]:
# Print a heading and the first five rows of the cleaned student table.
print("\nCleaned Data:", data.head(), sep="\n")


Cleaned Data:
  Student_ID       Name  Age  Gender   Location Enrolment_Date
0       S001  Student_1   24  Female    Kolkata     2023-11-24
1       S002  Student_2   32   Other    Chennai     2023-02-27
2       S003  Student_3   28   Other     Mumbai     2023-01-13
3       S004  Student_4   25  Female  Bangalore     2023-05-21
4       S005  Student_5   24   Other      Delhi     2023-05-06


In [16]:
# Write the cleaned student table first so the CSV exists even when the
# Colab-only download helper cannot be imported.
data.to_csv("cleaned_students_dataset.csv", index=False)

# google.colab is only importable inside a Colab runtime. Elsewhere, skip the
# browser download instead of failing; the CSV is already on disk.
try:
    from google.colab import files
except ImportError:
    print("google.colab is unavailable; skipping browser download.")
else:
    files.download("cleaned_students_dataset.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [27]:
# Load the raw course-activity log; course_activity.csv must be present in
# the kernel's working directory.
data1 = pd.read_csv("course_activity.csv")

In [28]:
# Show a heading followed by the first five course-activity rows.
print("Initial Data:", data1.head(), sep="\n")

Initial Data:
  Student_ID Course_ID        Date  Time_Spent_Minutes  Completion_Percentage
0       S001     PY202  05/01/2024                  90                  46.10
1       S001     DM101  28/01/2024                 155                  88.87
2       S001     UX303  28/01/2024                 177                  54.93
3       S002     PY202  03/02/2024                  45                  32.20
4       S002     UX303  15/03/2024                 119                  90.80


In [29]:
# Report, per column, how many course-activity entries are missing.
print("\nMissing Values:", data1.isna().sum(), sep="\n")


Missing Values:
Student_ID               0
Course_ID                0
Date                     0
Time_Spent_Minutes       0
Completion_Percentage    0
dtype: int64


In [30]:
# Count course-activity rows that repeat an earlier row in every column.
# Note: this reuses (overwrites) the name `duplicates`.
duplicates = data1.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")


Number of duplicate rows: 0


In [31]:
# Parse Date from day/month/year text (e.g. 05/01/2024) into datetime
# values; a value that does not match the format raises an error.
data1['Date'] = pd.to_datetime(data1['Date'], format='%d/%m/%Y')

In [32]:
# Force Time_Spent_Minutes to an integer dtype; this raises if the column
# contains missing or non-numeric values.
data1['Time_Spent_Minutes'] = data1['Time_Spent_Minutes'].astype(int)

In [33]:
# Force Completion_Percentage to a float dtype so it can be compared
# numerically; this raises if a value cannot be converted to a number.
data1['Completion_Percentage'] = data1['Completion_Percentage'].astype(float)

In [34]:
# Keep only rows whose completion percentage lies within 0-100 inclusive;
# rows with a missing percentage are dropped as well.
data1 = data1.loc[data1['Completion_Percentage'].between(0, 100)]

In [35]:
# Print a heading and the first five rows of the cleaned activity table.
print("\nCleaned Data:", data1.head(), sep="\n")


Cleaned Data:
  Student_ID Course_ID       Date  Time_Spent_Minutes  Completion_Percentage
0       S001     PY202 2024-01-05                  90                  46.10
1       S001     DM101 2024-01-28                 155                  88.87
2       S001     UX303 2024-01-28                 177                  54.93
3       S002     PY202 2024-02-03                  45                  32.20
4       S002     UX303 2024-03-15                 119                  90.80


In [37]:
# Write the cleaned course-activity table first so the CSV exists even when
# the Colab-only download helper cannot be imported.
data1.to_csv("cleaned_course_activity.csv", index=False)

# google.colab is only importable inside a Colab runtime. Elsewhere, skip the
# browser download instead of failing; the CSV is already on disk.
try:
    from google.colab import files
except ImportError:
    print("google.colab is unavailable; skipping browser download.")
else:
    files.download("cleaned_course_activity.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [5]:
# Load the raw course feedback; feedback.csv must be present in the kernel's
# working directory.
data2 = pd.read_csv("feedback.csv")

In [14]:
# Show a heading followed by the first five feedback rows.
print("Initial Data:", data2.head(), sep="\n")

Initial Data:
  Student_ID Course_ID  Rating       Feedback_Text
0       S057     UX303       2      Too fast-paced
1       S063     PY202       2  Loved the examples
2       S022     PY202       4     Could be better
3       S011     PY202       5   Needs improvement
4       S073     WD404       4     Could be better


In [15]:
# Report, per column, how many feedback entries are missing.
print("\nMissing Values:", data2.isna().sum(), sep="\n")


Missing Values:
Student_ID       0
Course_ID        0
Rating           0
Feedback_Text    0
dtype: int64


In [8]:
# Count feedback rows that repeat an earlier row in every column.
# Note: this reuses (overwrites) the name `duplicates`.
duplicates = data2.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")


Number of duplicate rows: 0


In [16]:
# Convert 'Rating' to an integer dtype; this raises if the column contains
# missing or non-numeric values.
data2['Rating'] = data2['Rating'].astype(int)


In [10]:
# Ensure 'Feedback_Text' is text. The nullable "string" dtype is used instead
# of astype(str) because astype(str) can turn a missing entry into the
# literal text "nan", hiding it from later isnull() checks; "string" keeps
# missing entries as <NA>.
data2['Feedback_Text'] = data2['Feedback_Text'].astype("string")


In [17]:
# Discard feedback rows whose 'Rating' falls outside the 1-5 scale
# (both bounds are kept).
data2 = data2.loc[data2['Rating'].between(1, 5)]

In [18]:
# Print a heading and the first five rows of the cleaned feedback table.
print("\nCleaned Data:", data2.head(), sep="\n")


Cleaned Data:
  Student_ID Course_ID  Rating       Feedback_Text
0       S057     UX303       2      Too fast-paced
1       S063     PY202       2  Loved the examples
2       S022     PY202       4     Could be better
3       S011     PY202       5   Needs improvement
4       S073     WD404       4     Could be better


In [20]:
from google.colab import files
data2.to_csv("cleaned_feedback.csv", index=False)

files.download("cleaned_feedback.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>