In [12]:
# 1. Load and inspect the Data:

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Both CSV files use ';' as the field separator.
student_maths = pd.read_csv('student-mat.csv', delimiter=';')
student_portu = pd.read_csv('student-por.csv', delimiter=';')

# First five rows of each dataset.
print(student_maths.head())
print(student_portu.head())

# (rows, columns) of each dataset.
print(f" Maths shape: {student_maths.shape}")
print(f" Portuguese shape: {student_portu.shape}")

# Summary of data types and missing values.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
student_maths.info()
student_portu.info()

# Statistics for numerical columns.
print(student_maths.describe())
print(student_portu.describe())

  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  ...  \
0     GP   F   18       U     GT3       A     4     4  at_home   teacher  ...   
1     GP   F   17       U     GT3       T     1     1  at_home     other  ...   
2     GP   F   15       U     LE3       T     1     1  at_home     other  ...   
3     GP   F   15       U     GT3       T     4     2   health  services  ...   
4     GP   F   16       U     GT3       T     3     3    other     other  ...   

  famrel freetime  goout  Dalc  Walc health absences  G1  G2  G3  
0      4        3      4     1     1      3        6   5   6   6  
1      5        3      3     1     1      3        4   5   5   6  
2      4        3      2     2     3      3       10   7   8  10  
3      3        2      2     1     1      5        2  15  14  15  
4      4        3      2     1     2      5        4   6  10  10  

[5 rows x 33 columns]
  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  ...  \
0     G

In [None]:
# Skills to showcase:
# Predictive analysis (e.g., predicting trends using Python or R)
# Exploratory data analysis (EDA) with real-world datasets
# Machine learning models (classification, regression, clustering)

# python skills
# Created a Python script using pandas to automate the cleaning and preparation of raw datasets, reducing manual processing time.
# Conducted exploratory data analysis (EDA) to identify trends, missing values, and outliers.
# Result: Improved data readiness for analysis and optimised workflow efficiency.

# tableau skills
# Developed an interactive Tableau dashboard to analyse patient outcome data, identifying trends and areas for improvement.
# Demonstrated skills in data cleaning, visualisation, and storytelling to communicate insights effectively.
# Result: Presented actionable insights that could inform healthcare decisions and resource allocation.


#Questions to ask for this data:
# Academic performance:
# 1. What factors most influence final grades (G3)? Analyze the relationship between G3 and variables such as studytime, failures, and absences (attendance). Use regression or correlation analysis to identify key predictors of academic success.
# 2. Does weekly study time (studytime) correlate with better performance in Mathematics or Portuguese? Compare the average G3 scores across different levels of studytime.
# 3. How do past failures (failures) impact students’ final grades? Investigate if students with more failures consistently perform worse.
# Student Support and Activities:
# 4. Does receiving educational support (schoolsup, famsup) improve final grades? Compare G3 between students who have support and those who don’t.
# 5. Do extra paid classes (paid) correlate with better academic outcomes? Analyze the impact of paid on grades (G3) in Mathematics or Portuguese.
# Demographics:
# 6. Does the type of guardian (guardian) impact academic success? Compare grades (G3) based on whether the guardian is mother, father, or other.
# Health and attendance:
# 7. What is the relationship between alcohol consumption (Dalc, Walc) and academic performance? Compare G3 scores across different levels of alcohol consumption.
# Exploring relationships:
# 8. What is the relationship between first-period grades (G1) and final grades (G3)? Perform regression analysis to see how well G1 predicts G3.
# Cross-Dataset comparisons:
# 9. Do students perform better in Mathematics or Portuguese on average? Calculate and compare mean G3 scores for the two subjects.
# 10. Are there significant differences in predictors of success between Mathematics and Portuguese? Compare the key predictors of G3 in student-mat.csv and student-por.csv.



# Things to do:
# 1. Clean and Transform the Dataset 
# - Handle missing values, duplicates, and irrelevant data.
# - Normalize or scale features as needed for analysis or modeling.

# 2. Perform Exploratory Data Analysis (EDA)
# - Summarize the data using statistics (mean, median, correlations).
# - Create visualizations (histograms, scatter plots) to identify trends or to show which factors (e.g., study time, parent involvement) most impact performance.

# 3. Apply Statistical and Machine Learning Techniques
# - Use the appropriate statistical tests (e.g., correlation, ANOVA) to answer the question or predict grades
# - Train and evaluate machine learning models (e.g., regression, classification).

# 4. Visualize 
# - Create meaningful visualizations (e.g., heatmaps, bar plots).

# 5. Summarise
# - Write a concise summary highlighting the main takeaways and recommendations (e.g., "More study time leads to better performance, but only up to a certain point.").



In [27]:
# 2. Clean data and missing values

# Missing-value counts per column for both datasets.
print(student_maths.isnull().sum())
print(student_portu.isnull().sum())


def encode_yes_no_columns(frame, columns):
    """Return a copy of ``frame`` with yes/no text columns encoded as 1/0.

    Each value is converted to a string, stripped of surrounding whitespace
    and lower-cased before mapping, so variants such as " Yes" are accepted.

    Parameters
    ----------
    frame : pandas.DataFrame
        Source data; it is not modified.
    columns : list of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        Copy of ``frame`` in which every listed column holds 1 for "yes"
        and 0 for "no".

    Raises
    ------
    ValueError
        If a listed column contains anything other than "yes"/"no". A plain
        ``Series.map`` would silently turn such values into NaN.
    """
    yes_no_codes = {'yes': 1, 'no': 0}
    encoded = frame.copy()
    for col in columns:
        cleaned = encoded[col].astype(str).str.strip().str.lower()
        unexpected = sorted(set(cleaned.unique()) - set(yes_no_codes))
        if unexpected:
            raise ValueError(
                f"Column '{col}' has values other than yes/no: {unexpected}"
            )
        encoded[col] = cleaned.map(yes_no_codes)
    return encoded


# Reload the raw files so the cell always starts from the original yes/no
# text; encoding values that are already 0/1 would not match the mapping.
student_maths = pd.read_csv('student-mat.csv', delimiter=';')
student_portu = pd.read_csv('student-por.csv', delimiter=';')

print(student_maths.head())
print(student_portu.head())

print(f" Maths shape: {student_maths.shape}")
print(f" Portuguese shape: {student_portu.shape}")

# Summary of data types and missing values.
# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print() (that would emit an extra "None" line).
student_maths.info()
student_portu.info()


#  Define binary columns
binary_columns = ['schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']

# Clean the text and map "yes" to 1 and "no" to 0 in a single pass.
student_maths = encode_yes_no_columns(student_maths, binary_columns)

# NOTE(review): student_portu is left with its original yes/no text, exactly
# as before. Apply encode_yes_no_columns to it as well once any later cells
# are confirmed not to rely on the text values.

# inspect the columns
for col in binary_columns:
    print(f"{col}: {student_maths[col].unique()}")

# preview the column changes
print(student_maths['schoolsup'].head())



school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
G1            0
G2            0
G3            0
dtype: int64
school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health     