In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
from google.colab import files
from sklearn.preprocessing import MinMaxScaler


# Load the raw student-performance dataset.
url = "https://raw.githubusercontent.com/ValentineKornel/config/refs/heads/main/StudentPerformanceFactors.csv"
df = pd.read_csv(url)

# Remove the two columns that are not used further on. Reassigning instead of
# using inplace=True keeps this step free of hidden in-place mutation.
df = df.drop(columns=["Learning_Disabilities", "Teacher_Quality"])
df.dtypes

# Report where the largest exam score sits, then cap scores at 100.
# The previous version overwrote the single largest score with 100
# unconditionally, which would inflate the maximum if it were below 100 and
# would repair only the first row if several rows exceeded 100.
# NOTE(review): presumably the raw file contains a score above 100 - confirm.
max_index = df['Exam_Score'].idxmax()
print("Индекс максимального элемента:", max_index)
df['Exam_Score'] = df['Exam_Score'].clip(upper=100)

# Numeric codes for every categorical column.
# Labels that are absent from a mapping (including missing values) become NaN.
# NOTE: this step is not idempotent - running it twice on the same frame maps
# the already-numeric values to NaN, so reload the data before re-running.
low_medium_high = {"Low": 1, "Medium": 2, "High": 3}
yes_no = {"Yes": 1, "No": 0}

category_codes = {
    "Parental_Involvement": low_medium_high,
    "Access_to_Resources": low_medium_high,
    "Extracurricular_Activities": yes_no,
    "Motivation_Level": low_medium_high,
    "Internet_Access": yes_no,
    "Family_Income": low_medium_high,
    # Fix: both school types used to be mapped to 1, which turned the column
    # into a constant and erased the public/private distinction.
    "School_Type": {"Public": 1, "Private": 0},
    "Peer_Influence": {"Negative": -1, "Neutral": 0, "Positive": 1},
    # Fix: the codes now follow the actual order of education levels
    # (previously College was coded below High_School).
    "Parental_Education_Level": {"High_School": 1, "College": 2, "Postgraduate": 3},
    "Distance_from_Home": {"Near": 1, "Moderate": 2, "Far": 3},
    "Gender": {"Male": 0, "Female": 1},
}

for column, codes in category_codes.items():
    df[column] = df[column].map(codes)

df.head()


# Columns that receive artificially injected missing values.
columns_with_nans = ["Hours_Studied", "Sleep_Hours", "Access_to_Resources", "Internet_Access", "Parental_Involvement"]
# NaN can only be stored in floating-point columns, so convert them first.
df[columns_with_nans] = df[columns_with_nans].astype(float)

# Each cell of the selected columns is blanked independently with this probability.
missing_fraction = 0.1
# A fixed seed makes the exported file identical on every run; the previous
# unseeded np.random.rand call produced a different dataset each time.
rng = np.random.default_rng(42)
mask = rng.random((df.shape[0], len(columns_with_nans))) < missing_fraction

# Cells where the mask is True are replaced by NaN.
df[columns_with_nans] = df[columns_with_nans].mask(mask)

df.head(10)
df.dtypes

# Save the result and hand it to the browser (Colab only).
# NOTE(review): the next section loads "StudentPerformanceFactors_withMissings.csv";
# presumably this file is renamed before being uploaded - confirm.
df.to_csv('StudentPerformanceFactors.csv', index=False)
files.download('StudentPerformanceFactors.csv')


# Load the version of the dataset that contains missing values.
url = "https://raw.githubusercontent.com/ValentineKornel/config/refs/heads/main/StudentPerformanceFactors_withMissings.csv"
df = pd.read_csv(url)

# Report where the largest exam score sits, then cap scores at 100.
# The previous version overwrote the single largest score with 100
# unconditionally, which would inflate the maximum if it were below 100 and
# would repair only the first row if several rows exceeded 100.
# NOTE(review): presumably the file contains a score above 100 - confirm.
max_index = df['Exam_Score'].idxmax()
print("Индекс максимального элемента:", max_index)
df['Exam_Score'] = df['Exam_Score'].clip(upper=100)

# Which columns contain at least one missing value.
df.isnull().any()


# Heat map of missing values: grey cells are present, red cells are missing.
cols = df.columns[:]
colours = ['#eeeeee', '#ff0000']
sns.heatmap(df[cols].isnull(), cmap=sns.color_palette(colours))
plt.title("Missing values by column")
# Render the heat map as its own figure. Without this call, the next pyplot
# command executed in the same cell would draw onto the heat map's axes.
plt.show()


# Number of missing values in each column
for col, pct_missing in df.isnull().sum().items():
    print('{} - {}'.format(col, round(pct_missing)))

# Percentage of missing values in each column (rounded to a whole percent)
for col, pct_missing in df.isnull().mean().items():
    print('{} - {}%'.format(col, round(pct_missing*100)))

# Drop the Parental_Education_Level column because it has too many missing values
df = df.drop(columns=["Parental_Education_Level"])
df.head()

# Fill the remaining gaps: the column mean for the hour counts,
# the most frequent value (mode) for the coded categorical columns.
for col in ("Hours_Studied", "Sleep_Hours"):
    df[col] = df[col].fillna(df[col].mean())

for col in ("Parental_Involvement", "Access_to_Resources", "Internet_Access", "Distance_from_Home"):
    most_frequent = df[col].mode()[0]
    df[col] = df[col].fillna(most_frequent)

df.head()

def remove_iqr_outliers(frame, column, k=1.5):
    """Return the rows of ``frame`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR`` (both inclusive),
    where Q1 and Q3 are the 25th and 75th percentiles of ``frame[column]``
    and ``IQR = Q3 - Q1``. Rows whose value in ``column`` is missing are not
    kept, because a missing value is never inside the interval.

    Parameters:
        frame: DataFrame to filter; it is not modified.
        column: name of the numeric column to test.
        k: fence multiplier; 1.5 is the value used throughout this notebook.

    Returns:
        A new DataFrame containing only the rows inside the fences.
    """
    q1 = frame[column].quantile(0.25)
    q3 = frame[column].quantile(0.75)
    iqr = q3 - q1
    return frame[frame[column].between(q1 - k * iqr, q3 + k * iqr)]


def show_boxplot(values, label, title):
    """Draw a labelled box plot of ``values`` and render it as its own figure."""
    plt.boxplot(values)
    plt.ylabel(label)
    plt.title(title)
    plt.show()


# Exam scores before any filtering.
show_boxplot(df['Exam_Score'], "Exam Score", "Box Plot of Exam Scores")

# Remove outliers column by column. Each pass computes its quartiles on the
# rows that survived the previous pass, and the plot shows the filtered data.
# Fix: the Attendance plot used to show the unfiltered df instead of
# df_no_outliers, so it did not reflect the removal.
df_no_outliers = df
for column, label, title in (
    ("Exam_Score", "Exam Score", "Box Plot of Exam Scores"),
    ("Hours_Studied", "Hours Studied", "Box Plot of Hours Studied"),
    ("Attendance", "Attendance", "Box Plot of Attendance"),
):
    df_no_outliers = remove_iqr_outliers(df_no_outliers, column)
    show_boxplot(df_no_outliers[column], label, title)

print(f"Количество строк до удаления выбросов: {len(df)}")
print(f"Количество строк после удаления выбросов: {len(df_no_outliers)}")


# Continue with the outlier-free rows only.
df = df_no_outliers

# Rescale every column to the [0, 1] range: fit the scaler on the frame,
# then transform that same frame.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(df)
normalized_data = scaler.transform(df)

# The scaler returns a bare array, so restore the column names.
df_normalized = pd.DataFrame(data=normalized_data, columns=df.columns)
df_normalized.head()

# From here on, df refers to the normalized frame.
df = df_normalized
df.head()

# Write the normalized data to disk and hand it to the browser (Colab only).
normalized_csv = 'StudentPerformanceFactors_Normalized.csv'
df.to_csv(normalized_csv, index=False)
files.download(normalized_csv)




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>