In [2]:
!pip install fpdf

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=e64efee539f8f35c5ac28c3b144645d1394e0bcd64e4b3c9976893730cf7fae1
  Stored in directory: /root/.cache/pip/wheels/65/4f/66/bbda9866da446a72e206d6484cd97381cbc7859a7068541c36
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


In [11]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from fpdf import FPDF
import os

# --- Paths -------------------------------------------------------------------
# Input and output folders. Both keep their trailing slash because the rest of
# this cell builds file paths by plain string concatenation.
data_dir = '/content/drive/MyDrive/PDS_Assignment/Question_2/data/'
results_dir = '/content/drive/MyDrive/PDS_Assignment/Question_2/results/'

# Create the folders if they don't exist. os.path.join avoids the doubled
# slash that `results_dir + '/visualizations'` would produce.
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(results_dir, 'visualizations'), exist_ok=True)

# --- Load ----------------------------------------------------------------------
# The raw CSV lives inside data_dir; reuse the variable instead of repeating
# the absolute path.
df = pd.read_csv(data_dir + 'StudentsPerformance.csv')

# Keep an untouched copy of the raw data next to the derived files.
df.to_csv(data_dir + 'student_performance_data.csv', index=False)

# Check the data types to identify categorical columns
print(df.dtypes)

# --- Clean ---------------------------------------------------------------------
# Two-level categorical columns and the integer code given to each label.
binary_encodings = {
    'gender': {'male': 0, 'female': 1},
    'lunch': {'standard': 0, 'free/reduced': 1},
    'test preparation course': {'none': 0, 'completed': 1},
}

for column, codes in binary_encodings.items():
    encoded = df[column].map(codes)
    # Series.map turns any label missing from `codes` into NaN. Fail loudly
    # instead of writing silently corrupted data; values that were already
    # missing in the input are left alone.
    unexpected = df.loc[df[column].notna() & encoded.isna(), column].unique()
    if len(unexpected) > 0:
        raise ValueError(
            f"Unexpected value(s) in column '{column}': {sorted(map(str, unexpected))}"
        )
    df[column] = encoded

# Optional 'group' column: 'group A' -> 0 and 'group B' -> 1 as before; any
# further labels receive the next integers (in sorted order) instead of
# becoming NaN. The dtypes printed for this dataset list 'race/ethnicity'
# rather than 'group', so this branch is skipped for this file.
if 'group' in df.columns:
    group_codes = {'group A': 0, 'group B': 1}
    extra_labels = sorted(set(df['group'].dropna()) - set(group_codes))
    group_codes.update(
        {label: code for code, label in enumerate(extra_labels, start=len(group_codes))}
    )
    df['group'] = df['group'].map(group_codes)

# Save cleaned data to CSV
df.to_csv(data_dir + 'student_performance_cleaned_data.csv', index=False)

# --- Correlations --------------------------------------------------------------
# Series.corr defaults to the Pearson coefficient.
correlation_math_reading = df['math score'].corr(df['reading score'])
correlation_math_writing = df['math score'].corr(df['writing score'])
correlation_reading_writing = df['reading score'].corr(df['writing score'])

# The "processed" dataset is currently just a copy of the cleaned frame; the
# correlation values are stored separately in the results folder.
processed_data = df.copy()

# Save processed data to CSV
processed_data.to_csv(data_dir + 'student_performance_processed_data.csv', index=False)

# Label names for the 0/1 codes assigned to these columns earlier in this
# cell, so categorical axes show names instead of bare integers.
gender_names = {0: 'male', 1: 'female'}
prep_names = {0: 'none', 1: 'completed'}


def _save_and_close(fig, filename):
    """Save ``fig`` into the visualizations folder under ``results_dir`` and close it.

    Closing the figure releases its memory and keeps it from being rendered
    inline in the notebook.
    """
    fig.savefig(results_dir + 'visualizations/' + filename)
    plt.close(fig)


# Scatter Plot: Math Score vs. Reading Score
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x='math score', y='reading score', data=df, ax=ax)
ax.set(title='Math Score vs. Reading Score', xlabel='Math Score', ylabel='Reading Score')
_save_and_close(fig, 'math_reading_scatter.png')

# Box Plot: Math Score by Gender (codes translated back to label names)
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(
    x=df['gender'].map(gender_names),
    y=df['math score'],
    order=list(gender_names.values()),  # fixed order: code 0 first, then code 1
    ax=ax,
)
ax.set(title='Math Score by Gender', xlabel='Gender', ylabel='Math Score')
_save_and_close(fig, 'gender_math_boxplot.png')

# Histogram: Distribution of Math Scores (with a kernel density overlay)
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df['math score'], kde=True, ax=ax)
ax.set(title='Distribution of Math Scores', xlabel='Math Score', ylabel='Frequency')
_save_and_close(fig, 'math_score_histogram.png')

# Count Plot: Test Preparation Course Completion (codes translated back to label names)
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(
    x=df['test preparation course'].map(prep_names),
    order=list(prep_names.values()),
    ax=ax,
)
ax.set(title='Test Preparation Course Completion', xlabel='Test Preparation Course', ylabel='Count')
_save_and_close(fig, 'test_preparation_countplot.png')

# Pairplot: Relationships between numerical features (Math, Reading, and Writing scores)
# pairplot builds its own figure, which becomes the current pyplot figure, so
# the pyplot-level save/close calls act on it.
sns.pairplot(df[['math score', 'reading score', 'writing score']])
plt.savefig(results_dir + 'visualizations/pairplot.png')
plt.close()

# Plain-text summary: one line per correlation (full precision), followed by
# the names of the generated figures. The last line has no trailing newline.
summary_lines = [
    f"Correlation between math and reading score: {correlation_math_reading}\n",
    f"Correlation between math and writing score: {correlation_math_writing}\n",
    f"Correlation between reading and writing score: {correlation_reading_writing}\n",
    "Visualizations: math_reading_scatter, gender_math_boxplot, math_score_histogram, test_preparation_countplot, pairplot",
]

with open(results_dir + 'analysis_results.txt', 'w') as results_file:
    results_file.writelines(summary_lines)

# Create a PDF report with the correlation summary followed by every figure.
pdf = FPDF()
pdf.add_page()

# Title. A cell width of 0 stretches to the right margin, so the centred text
# is centred on the printable area.
pdf.set_font("Arial", 'B', 16)
pdf.cell(0, 10, txt="Student Performance Analysis Report", ln=True, align="C")

# Section 1: correlations (rounded to two decimals for the report).
pdf.set_font("Arial", 'B', 12)
pdf.ln(10)
pdf.cell(0, 10, txt="1. Correlations", ln=True)

pdf.set_font("Arial", size=12)
pdf.multi_cell(0, 10, txt=f"Correlation between math and reading score: {correlation_math_reading:.2f}\n"
                          f"Correlation between math and writing score: {correlation_math_writing:.2f}\n"
                          f"Correlation between reading and writing score: {correlation_reading_writing:.2f}")

# Section 2: figures. The heading uses the same bold style as section 1.
pdf.ln(10)
pdf.set_font("Arial", 'B', 12)
pdf.cell(0, 10, txt="2. Visualizations", ln=True)

report_images = (
    "math_reading_scatter.png",
    "gender_math_boxplot.png",
    "math_score_histogram.png",
    "test_preparation_countplot.png",
    "pairplot.png",
)
for image_name in report_images:
    # Leave `y` unset so FPDF places the image at the current position, starts
    # a new page when it would not fit, and moves the cursor down by the real
    # image height. Passing an explicit y disables the page break, which made
    # the figures overlap and run off the bottom of the page.
    pdf.image(results_dir + "visualizations/" + image_name, x=10, w=180)
    pdf.ln(5)  # small gap between consecutive figures

pdf.output(results_dir + "student_performance_analysis_report.pdf")


gender                         object
race/ethnicity                 object
parental level of education    object
lunch                          object
test preparation course        object
math score                      int64
reading score                   int64
writing score                   int64
dtype: object


''