# EXAM PROJECT

In [14]:
# We have been given a data set which is in a csv (comma-separatedvalues) file.
# First and foremost we have to read the dataset in a pandas dataframe and to do this we have steps which will be listed below.
# A dataframe is a 2 Dimensional table.

# To be able to read our dataset into pandas we have to;
# A. upload the dataset into our jupyter notebook folder,
# so it can be accessesible by the computer without throwing an error of file not found
# B. import pandas under any alias mine i choose pd, yours you can choose pnd, e.t.c.
# Pandas is a software library written for the Python programming language for data manipulation and analysis.

import pandas as pd

In [15]:
def read_dataset(file_path):

    """
    This function reads a dataset from a file into a pandas DataFrame and handles errors gracefully.

    Parameters:
    file_path (str): The path to the dataset file.

    Returns:
    pd.DataFrame or None: Returns the DataFrame if successful, otherwise None.
    """
    try:
        # Try reading the file
        data_set = pd.read_csv(file_path)
        print("File loaded successfully.")
        return data_set
    except FileNotFoundError:
        # Handle the case where the file is not found
        print(f"Error: The file at path '{file_path}' was not found.")
    except pd.errors.EmptyDataError:
        # Handle the case where the file is empty
        print(f"Error: The file at path '{file_path}' is empty.")
    except pd.errors.ParserError:
        # Handle the case where the file contents are not parseable
        print(f"Error: The file at path '{file_path}' could not be parsed. Please check if it's a valid CSV.")
    except Exception as e:
        # Catch all other exceptions and print a generic message
        print(f"An unexpected error occurred: {str(e)}")
    
    # Return None if an error occurred
    return None

In [16]:
read_dataset('dfa.csv')

Error: The file at path 'dfa.csv' was not found.


In [18]:
read_dataset('empty_dataset.csv')

Error: The file at path 'empty_dataset.csv' is empty.


In [19]:
read_dataset('dataset_31_credit-g.arff')

Error: The file at path 'dataset_31_credit-g.arff' could not be parsed. Please check if it's a valid CSV.


In [20]:
read_dataset('titanic.c')

Error: The file at path 'titanic.c' was not found.


In [21]:
# Our file has been given a variable name and read as a csv file by pandas into a dataframe

data_set = read_dataset('titanic.csv')   
# Example: After uploading, it doesn't throw an error. Thus, data set was successfully read in a pandas dataframe

File loaded successfully.


In [22]:
data_set.head()       #To read the first 5 rows of our data use the function .head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [23]:
type(data_set)       # This is a built-in function.
                     # It is used to return the type of data stored in the objects or variables in the program. 

pandas.core.frame.DataFrame

Showing the sum of the missung value for each column

In [24]:
# This function sums the missing values for each column

def missing_values(file):                                          #reading the dataset into a dataframe
    """This sums the number of missing values in a column"""
    import pandas as pd                                            #Importing pandas as a part of the function to avoid repeating the step
     
    try:                                                           #handling possible errors
        file = pd.read_csv(file)
        missing_values = file.isnull()     #brings out a boolean output (true/ false)
        return missing_values
    except FileNotFoundError:
        print("Error: The file was not found.")  
    except TypeError:
        print("Error: Invalid Format.")
    except NameError as ne:
        print(f"Error: {file} not defined: {ne}")
    except ValueError:
        print("Error: Invalid Value.")
    except pd.error.EmptyDataError:
        print("Error: The file is empty.")
    except pd.error.ParserError:
        print("Error: The file is corrupted or contains invalid data.")
    except Exception as e:
        print(f"An unexpected error was encountered: {e}" )
# isna() method checks whether the objects of a Dataframe or a Series contain missing or null values (NA, NaN)
# And returns a new object with the same shape as the original but with boolean values True or False as the elements.

In [25]:
missing_values('titanic.csv').head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,True,False


In [26]:
missing_values('dataset_31_credit-g.arff')

Error: Invalid Value.


In [27]:
missing_values('empty_data.csv')

Error: The file was not found.


In [29]:
missing_values(data_set)

Error: Invalid Format.


In [30]:
# To count the number of NaN values in a specific column in a Pandas DataFrame, we can use the isna() and sum() functions.
# The sum() function returns the sum of True values, which equals the number of NaN values in the column.

# This shows the sum / amount of rows with missing values in thier column. A new variable name was given

# This function sums the missing values for each column

def missing_values(file):                                          #reading the dataset into a dtaframe
    """This sums the number of missing values in a column"""
    import pandas as pd                                            #Importing pandas as a part of the function to avoid repeating the step
     
    try:                                                           #handling possible errors
        file = pd.read_csv(file) 
        missing_values = file.isnull().sum()
        return missing_values
    except FileNotFoundError:
        print("Error: The file was not found.")
    except TypeError:
        print("Error: Invalid Format.")
    except NameError as ne:
        print(f"Error: {file} not defined: {ne}")
    except ValueError:
        print("Error: Invalid Value.")
    except pd.error.EmptyDataError:
        print("Error: The file is empty.")
    except pd.error.ParserError:
        print("Error: The file is corrupted or contains invalid data.")
    except Exception as e:
        print(f"An unexpected error was encountered: {e}" )

In [31]:
missing_values('titanic.csv')

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [32]:
missing_values('dataset_31_credit-g.arff')

Error: Invalid Value.


In [33]:
missing_values('empty_data.csv')

Error: The file was not found.


In [34]:
missing_values(data_set)

Error: Invalid Format.


Checking for duplicates in each column function returns a dictionary and boolean values indicating the presence of duplicates.

In [35]:
def check_duplicates(file_name):
    """This function checks for duplicates in each column and 
    returns the result in a dictionary as a boolean value."""

    import pandas as pd                                               #Importing pandas as a part of the function to avoid repeating the step
    
    duplicate_dict = {}                                               # Initialize the dictionary to store results
    
    try:
        file_name = pd.read_csv(file_name)                             # Read the CSV file into a DataFrame
        
        for column in file_name.columns:                               # Iterate through each column and check for duplicates
            has_duplicates = file_name[column].duplicated().any()
            duplicate_dict[column] = has_duplicates                    # Store the result in the dictionary
        
        return duplicate_dict                                          # Return the dictionary with the results

    except FileNotFoundError:
        print("Error: The file was not found.")
    except pd.errors.EmptyDataError:
        print("Error: The file is empty.")
    except pd.errors.ParserError:
        print("Error: The file is corrupted or contains invalid data.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

In [36]:
check_duplicates('empty_data.csv')

Error: The file was not found.


In [37]:
check_duplicates('dataset_31_credit-g.arff')

Error: The file is corrupted or contains invalid data.


In [38]:
check_duplicates('titanic.c')

Error: The file was not found.


In [39]:
check_duplicates('titanic.csv')

{'PassengerId': False,
 'Survived': True,
 'Pclass': True,
 'Name': False,
 'Sex': True,
 'Age': True,
 'SibSp': True,
 'Parch': True,
 'Ticket': True,
 'Fare': True,
 'Cabin': True,
 'Embarked': True}

In [41]:
def check_duplicates_sum(file_name):
    """This function checks for duplicates in each column and 
    returns the result in a dictionary as a boolean value."""

    import pandas as pd                                               #Importing pandas as a part of the function to avoid repeating the step
    
    duplicate_dict = {}                                               # Initialize the dictionary to store results
    
    try:
        file_name = pd.read_csv(file_name)                             # Read the CSV file into a DataFrame
        
        for column in file_name.columns:                               # Iterate through each column and check for duplicates
            has_duplicates = file_name[column].duplicated().sum()
            duplicate_dict[column] = has_duplicates                    # Store the result in the dictionary
        
        return duplicate_dict                                          # Return the dictionary with the results

    except FileNotFoundError:
        print("Error: The file was not found.")
    except pd.errors.EmptyDataError:
        print("Error: The file is empty.")
    except pd.errors.ParserError:
        print("Error: The file is corrupted or contains invalid data.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

In [42]:
check_duplicates_sum('empty_data.csv')

Error: The file was not found.


In [43]:
check_duplicates_sum('dataset_31_credit-g.arff')

Error: The file is corrupted or contains invalid data.


In [44]:
check_duplicates_sum('titanic.c')

Error: The file was not found.


In [45]:
check_duplicates_sum('titanic.csv')

{'PassengerId': 0,
 'Survived': 889,
 'Pclass': 888,
 'Name': 0,
 'Sex': 889,
 'Age': 802,
 'SibSp': 884,
 'Parch': 884,
 'Ticket': 210,
 'Fare': 643,
 'Cabin': 743,
 'Embarked': 887}