<a href="https://colab.research.google.com/github/PrathamKumar125/Analysis-Report-Maker/blob/main/Analysis_Report_maker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install fpdf

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=cbc1c3cbcd5cce70c1feae79a9f140dd71d3850e4e004f22e10c7b0cb780ca8c
  Stored in directory: /root/.cache/pip/wheels/f9/95/ba/f418094659025eb9611f17cbcaf2334236bf39a0c3453ea455
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from fpdf import FPDF
import os

In [4]:
df = pd.read_csv('/content/employee.csv')

In [6]:
def list_missing_values(df):
    missing_values = df.isnull().sum()
    return missing_values[missing_values > 0]

list_missing_values(df)

Unnamed: 0,0


In [7]:
def categorize_columns(df):
    numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
    categorical_cols = df.select_dtypes(exclude=['number']).columns.tolist()
    return numeric_cols, categorical_cols

categorize_columns(df)

(['EMPLOYEE_ID', 'SALARY', 'DEPARTMENT_ID'],
 ['FIRST_NAME',
  'LAST_NAME',
  'EMAIL',
  'PHONE_NUMBER',
  'HIRE_DATE',
  'JOB_ID',
  'COMMISSION_PCT',
  'MANAGER_ID'])

In [8]:
def list_and_remove_duplicates(df):
    duplicates = df[df.duplicated()]
    df_no_duplicates = df.drop_duplicates()
    return duplicates, df_no_duplicates

list_and_remove_duplicates(df)

(Empty DataFrame
 Columns: [EMPLOYEE_ID, FIRST_NAME, LAST_NAME, EMAIL, PHONE_NUMBER, HIRE_DATE, JOB_ID, SALARY, COMMISSION_PCT, MANAGER_ID, DEPARTMENT_ID]
 Index: [],
     EMPLOYEE_ID   FIRST_NAME    LAST_NAME     EMAIL  PHONE_NUMBER  HIRE_DATE  \
 0           198       Donald     OConnell  DOCONNEL  650.507.9833  21-JUN-07   
 1           199      Douglas        Grant    DGRANT  650.507.9844  13-JAN-08   
 2           200     Jennifer       Whalen   JWHALEN  515.123.4444  17-SEP-03   
 3           201      Michael    Hartstein  MHARTSTE  515.123.5555  17-FEB-04   
 4           202          Pat          Fay      PFAY  603.123.6666  17-AUG-05   
 5           203        Susan       Mavris   SMAVRIS  515.123.7777  07-JUN-02   
 6           204      Hermann         Baer     HBAER  515.123.8888  07-JUN-02   
 7           205      Shelley      Higgins  SHIGGINS  515.123.8080  07-JUN-02   
 8           206      William        Gietz    WGIETZ  515.123.8181  07-JUN-02   
 9           100       

In [16]:
def list_and_remove_constants(df):
    constant_cols = [col for col in df.columns if df[col].nunique() == 1]
    df_no_constants = df.drop(columns=constant_cols)
    return constant_cols, df_no_constants

list_and_remove_constants(df)

(['COMMISSION_PCT'],
     EMPLOYEE_ID   FIRST_NAME    LAST_NAME     EMAIL  PHONE_NUMBER  HIRE_DATE  \
 0           198       Donald     OConnell  DOCONNEL  650.507.9833  21-JUN-07   
 1           199      Douglas        Grant    DGRANT  650.507.9844  13-JAN-08   
 2           200     Jennifer       Whalen   JWHALEN  515.123.4444  17-SEP-03   
 3           201      Michael    Hartstein  MHARTSTE  515.123.5555  17-FEB-04   
 4           202          Pat          Fay      PFAY  603.123.6666  17-AUG-05   
 5           203        Susan       Mavris   SMAVRIS  515.123.7777  07-JUN-02   
 6           204      Hermann         Baer     HBAER  515.123.8888  07-JUN-02   
 7           205      Shelley      Higgins  SHIGGINS  515.123.8080  07-JUN-02   
 8           206      William        Gietz    WGIETZ  515.123.8181  07-JUN-02   
 9           100       Steven         King     SKING  515.123.4567  17-JUN-03   
 10          101        Neena      Kochhar  NKOCHHAR  515.123.4568  21-SEP-05   
 11    

In [11]:
def create_box_plots(df, output_dir):
    numeric_cols = df.select_dtypes(include=['number']).columns
    for col in numeric_cols:
        plt.figure(figsize=(10, 6))
        sns.boxplot(x=df[col])
        plt.title(f'Box plot of {col}')
        plt.savefig(os.path.join(output_dir, f'boxplot_{col}.png'))
        plt.close()

create_box_plots(df, '/content/')

In [12]:
def create_distribution_charts(df, output_dir):
    sample_cols = df.columns[:6]
    for col in sample_cols:
        plt.figure(figsize=(10, 6))
        sns.histplot(df[col], kde=True)
        plt.title(f'Distribution of {col}')
        plt.savefig(os.path.join(output_dir, f'distribution_{col}.png'))
        plt.close()

create_distribution_charts(df, '/content/')

In [17]:
def generate_report(df, output_dir='/content/report'):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    missing_values = list_missing_values(df)
    numeric_cols, categorical_cols = categorize_columns(df)
    duplicates, df_no_duplicates = list_and_remove_duplicates(df)
    constant_cols, df_no_constants = list_and_remove_constants(df)

    create_box_plots(df, output_dir)
    create_distribution_charts(df, output_dir)

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    pdf.cell(200, 10, txt="Data Report", ln=True, align='C')

    pdf.cell(200, 10, txt="Missing Values:", ln=True)
    for col, val in missing_values.items():
        pdf.cell(200, 10, txt=f"{col}: {val}", ln=True)

    pdf.cell(200, 10, txt="Numeric Columns:", ln=True)
    for col in numeric_cols:
        pdf.cell(200, 10, txt=col, ln=True)

    pdf.cell(200, 10, txt="Categorical Columns:", ln=True)
    for col in categorical_cols:
        pdf.cell(200, 10, txt=col, ln=True)

    pdf.cell(200, 10, txt="Duplicates Before Removal:", ln=True)
    pdf.cell(200, 10, txt=str(duplicates), ln=True)

    pdf.cell(200, 10, txt="Duplicates After Removal:", ln=True)
    pdf.cell(200, 10, txt=str(df_no_duplicates), ln=True)

    pdf.cell(200, 10, txt="Constant Columns:", ln=True)
    for col in constant_cols:
        pdf.cell(200, 10, txt=col, ln=True)

    pdf.output(os.path.join(output_dir, "data_report.pdf"))

In [18]:
# Generate the report
generate_report(df)