# DATA INGESTION AND WRANGLING


## 1) Formats chosen: 

* .jpg/.jpeg
* .mp4
* .xlsx
* .json
* .csv


## 2) Python functions for loading data in different formats 

### 2.1) load_jpeg_image

In [None]:
from PIL import Image
from IPython.display import display

def load_jpeg_image(file_path):
    """Open an image file with Pillow and hand back the image object.

    Args:
        file_path: Path of the image file to open.

    Returns:
        The object returned by ``Image.open(file_path)``.
    """
    return Image.open(file_path)

# Example usage: load a JPEG from a local, machine-specific path.
image = load_jpeg_image('C:/Users/acm11/Pictures/Mønster-prøve.jpg')

# Uncomment to open the image in an external viewer application
# (presumably the operating system's default image viewer):
#image.show()



### 2.2) load_mp4_video

In [None]:
import cv2
import time
import matplotlib.pyplot as plt

def load_mp4_video(file_path):
    """Read every frame of a video file into memory.

    Opens ``file_path`` with ``cv2.VideoCapture``, reads frames one at a
    time until ``read()`` reports failure, and prints the number of frames
    loaded together with the time the loading took.

    Args:
        file_path: Path of the video file to read.

    Returns:
        list: The frames in the order they were read. The list is empty
        when no frame could be read (for example when the file could not
        be opened).
    """
    # VideoCapture gives sequential access to the frames of the file.
    video = cv2.VideoCapture(file_path)

    # Previously a file that failed to open silently produced an empty list;
    # say so explicitly, but keep returning [] so existing callers still work.
    if not video.isOpened():
        print(f"Warning: could not open video file: {file_path}")

    frames = []

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    print("Starting to load frames...")

    try:
        while True:
            # read() advances to the next frame and returns
            # (success flag, frame); the flag is falsy once no frame is left.
            fr_read_succes, frame = video.read()

            if not fr_read_succes:
                break

            frames.append(frame)

        elapsed_time = time.perf_counter() - start_time
    finally:
        # Always release the capture, even if reading raised (e.g. running
        # out of memory while accumulating frames).
        video.release()

    print(f"Frames loaded: {len(frames)}")
    print(f"Total time taken: {elapsed_time:.2f} seconds")

    return frames


# Example usage: load all frames of a local, machine-specific video file.
frames = load_mp4_video('C:/Users/acm11/Videos/SodaVideo.mp4')
    

### 2.3) load_excel_file

In [1]:
import pandas as pd

def load_excel_file(file_path, headerChosen):
    """Read an Excel workbook into a pandas DataFrame.

    Args:
        file_path: Location of the Excel file to read.
        headerChosen: Forwarded to ``pandas.read_excel`` as its ``header``
            argument.

    Returns:
        The DataFrame produced by ``pandas.read_excel``.
    """
    return pd.read_excel(file_path, header=headerChosen)

# Example usage: read the same workbook twice with different header rows.
# header=0 is passed for the first load, header=1 for the second.
excel_data1 = load_excel_file('C:/Users/acm11/OneDrive/Dokumenter/test_excel_familien.xlsx', 0)
excel_data2 = load_excel_file('C:/Users/acm11/OneDrive/Dokumenter/test_excel_familien.xlsx', 1)

# Only the header=0 result is printed; uncomment the second line to compare.
print(excel_data1)
#print(excel_data2)

   id    name  age                                        random_info
0  ID    NAME  AGE                                        RANDOM_INFO
1   1  Tobias   34  Can only ride without hands on one bike in the...
2   2   Jonas   31                           Is the best fisher in NV
3   3   Lykke   63                                 Is a great teacher
4   4    Finn   63                               Is a great guitarist


In [None]:
# Convert the first loaded video frame from BGR to RGB channel order.
# Depends on `frames` from the load_mp4_video cell; `frames[0]` raises
# IndexError if no frame was loaded.
first_frame_rgb = cv2.cvtColor(frames[0], cv2.COLOR_BGR2RGB)

# Optional: show the frame inline with matplotlib.
#plt.imshow(first_frame_rgb)
#plt.title("RGB Image of first frame in soda video")
#plt.axis("off")
#plt.show()


# Wrap the RGB array in a Pillow image (uses `Image` imported in the JPEG cell).
first_frame_image = Image.fromarray(first_frame_rgb)

# Optional: display the image in an external viewer application
#first_frame_image.show(title='First Frame')

# cv2.destroyAllWindows()

# Optional: display the raw pixel array instead of a rendered image
#display(first_frame_rgb)

### 2.4) load_json_file

In [None]:
import json

def load_json_file(file_path):
    """Load a JSON file into a pandas DataFrame.

    The file is decoded explicitly as UTF-8. Without an explicit encoding,
    ``open`` falls back to the platform default (a legacy code page on many
    Windows setups), which garbles or rejects non-ASCII characters in
    UTF-8 encoded JSON.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The DataFrame built by passing the parsed JSON to ``pd.DataFrame``.

    Raises:
        OSError: The file cannot be opened.
        json.JSONDecodeError: The file content is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    return pd.DataFrame(data)
    
# Example usage
#df = load_json_file('C:/User/SomeDirectory/SomeFile.json')


### 2.5) load_csv_file

In [5]:
def load_csv_file(file_path):
    """Read a CSV file into a pandas DataFrame.

    The first line of the file is used as the column header (``header=0``).

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    return pd.read_csv(file_path, header=0)

#df = load_csv_file("C:/User/SomeDirectory/SomeFile.csv")

## 3) Using some of my functions to create Data Frames

* 3.1) XLS -> Data Frame
* 3.2) JSON -> Data Frame
* 3.3) CSV -> Data Frame

### 3.1) XLS -> Data Frame


In [3]:
# Load the US presidents workbook, using its first row (header=0) as column names.
uspres_df = load_excel_file("C:/Users/acm11/BusinessIntelligencedat4/US_Presidents.xlsx", 0)

### 3.2) JSON -> Data Frame

In [None]:
# Load the Iris JSON file into a DataFrame; later cells that use `iris_df`
# require this cell to have been run.
iris_df = load_json_file("C:/Users/acm11/BusinessIntelligencedat4/iris.json")

### 3.3) CSV -> Data Frame

In [7]:
# Load the student alcohol survey CSV into a DataFrame.
alcohol_df = load_csv_file("C:/Users/acm11/BusinessIntelligencedat4/Alcohol_effect_on_students.csv")

## 4) Exploration and cleaning of data

### 4.1) Exploring

#### 4.1.1) Exploring US Presidents

##### 4.1.1.1) The 5 first rows: .head()

In [13]:
# Preview the first rows of the presidents data.
uspres_df.head()

Unnamed: 0.1,Unnamed: 0,S.No.,president,prior,party,vice,salary,date updated,date created
0,0,1,George Washington,Commander-in-Chief of the Continental Army ...,Nonpartisan,John Adams,5000,2021-07-14,2012-03-04
1,1,2,john adams,1st Vice President of the United States,Federalist,Thomas Jefferson,10000,2021-07-14,2012-03-04
2,2,3,Thomas Jefferson,2nd Vice President of the United States,Democratic- Republican,Aaron Burr,15000,2021-07-14,2012-03-04
3,3,4,James Madison,5th United States Secretary of State (1801â...,Democratic- Republican,George Clinton,20000,2021-07-14,2012-03-04
4,4,5,JAMES MONROE,7th United States Secretary of State (1811â...,Democratic- Republican,Daniel D. Tompkins,25000,2021-07-14,2012-03-04


##### 4.1.1.2) Data types: .dtypes

In [16]:
# Per-column dtypes as a Series (index = column names, values = dtypes).
dtypes_series = uspres_df.dtypes
# Put the names and the dtypes into two labelled rows so they display as one wide table.
dtypes_df = pd.DataFrame([dtypes_series.index, dtypes_series.values], index=['Column Name', 'Data Type'])
dtypes_df

Unnamed: 0,0,1,2,3,4,5,6,7,8
Column Name,Unnamed: 0,S.No.,president,prior,party,vice,salary,date updated,date created
Data Type,int64,int64,object,object,object,object,int64,datetime64[ns],datetime64[ns]


##### 4.1.1.3) All missing values (NA or NaN): .isna().sum()

In [19]:
# Count the missing values (NA/NaN) in each column.
na_values = uspres_df.isna().sum()
# Show what kind of object the count is.
type(na_values)

pandas.core.series.Series

In [21]:
# Wrap the per-column missing-value counts in a DataFrame for tabular display.
na_val_df = pd.DataFrame(na_values)
na_val_df

Unnamed: 0,0
Unnamed: 0,0
S.No.,0
president,0
prior,0
party,0
vice,0
salary,0
date updated,0
date created,0


#### 4.1.2) Exploring Iris data

##### 4.1.2.1) All missing values (NA or NaN): .isna().sum()

In [25]:
# Count missing values per column of the Iris data.
# Requires the cell in section 3.2 that defines `iris_df` to have been run;
# otherwise this raises NameError.
iris_df.isna().sum()

NameError: name 'iris_df' is not defined

#### 4.1.3) Exploring Data About the Effect of Alcohol Use on Students Performance

##### 4.1.3.2) All missing values (NA or NaN): .isna().sum()

In [29]:
# Count missing values (NA/NaN) per column of the survey data.
alcohol_df.isna().sum()

Timestamp                                                                                             0
Your Sex?                                                                                             2
Your Matric (grade 12) Average/ GPA (in %)                                                            7
What year were you in last year (2023) ?                                                             73
What faculty does your degree fall under?                                                             7
Your 2023 academic year average/GPA in % (Ignore if you are 2024 1st year student)                   86
Your Accommodation Status Last Year (2023)                                                           23
Monthly Allowance in 2023                                                                            31
Were you on scholarship/bursary in 2023?                                                              8
Additional amount of studying (in hrs) per week                 

##### 4.1.3.3) Data types


In [32]:
# Per-column dtypes as a Series (index = column names, values = dtypes).
dtypes_as_series = alcohol_df.dtypes
# Put the names and the dtypes into two labelled rows so they display as one wide table.
dtypes_as_df = pd.DataFrame([dtypes_as_series.index, dtypes_as_series.values], index=['Column Name', 'Data Type'])
dtypes_as_df

# Alternative: print the dtypes Series directly.
#print(alcohol_df.dtypes)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
Column Name,Timestamp,Your Sex?,Your Matric (grade 12) Average/ GPA (in %),What year were you in last year (2023) ?,What faculty does your degree fall under?,Your 2023 academic year average/GPA in % (Igno...,Your Accommodation Status Last Year (2023),Monthly Allowance in 2023,Were you on scholarship/bursary in 2023?,Additional amount of studying (in hrs) per week,How often do you go out partying/socialising d...,"On a night out, how many alcoholic drinks do y...",How many classes do you miss per week due to a...,How many modules have you failed thus far into...,Are you currently in a romantic relationship?,Do your parents approve alcohol consumption?,How strong is your relationship with your pare...
Data Type,object,object,float64,object,object,float64,object,object,object,object,object,object,object,object,object,object,object


##### 4.1.3.4) Number of rows

In [35]:
# shape is (rows, columns); index 0 gives the number of rows.
alcohol_df.shape[0]

406

##### 4.1.3.5) Top five rows

In [38]:
# Preview the first rows of the survey data.
alcohol_df.head()

Unnamed: 0,Timestamp,Your Sex?,Your Matric (grade 12) Average/ GPA (in %),What year were you in last year (2023) ?,What faculty does your degree fall under?,Your 2023 academic year average/GPA in % (Ignore if you are 2024 1st year student),Your Accommodation Status Last Year (2023),Monthly Allowance in 2023,Were you on scholarship/bursary in 2023?,Additional amount of studying (in hrs) per week,How often do you go out partying/socialising during the week?,"On a night out, how many alcoholic drinks do you consume?","How many classes do you miss per week due to alcohol reasons, (i.e: being hungover or too tired?)",How many modules have you failed thus far into your studies?,Are you currently in a romantic relationship?,Do your parents approve alcohol consumption?,How strong is your relationship with your parent/s?
0,2024/03/07 5:12:01 pm EET,Female,76.0,2nd Year,Arts & Social Sciences,72.0,Private accommodation/ stay with family/friends,R 4001- R 5000,No,8+,Only weekends,8+,3,0,Yes,Yes,Very close
1,2024/03/07 5:12:08 pm EET,Male,89.0,2nd Year,Economic & Management Sciences,75.0,Private accommodation/ stay with family/friends,R 7001 - R 8000,"Yes (NSFAS, etc...)",8+,Only weekends,3-5,4+,0,No,Yes,Very close
2,2024/03/07 5:12:25 pm EET,Male,76.0,1st Year,AgriSciences,55.0,Private accommodation/ stay with family/friends,R 4001- R 5000,No,3-5,2,8+,3,0,No,Yes,Very close
3,2024/03/07 5:12:28 pm EET,Male,89.0,2nd Year,Engineering,84.0,Private accommodation/ stay with family/friends,R 6001 - R 7000,No,3-5,3,8+,2,0,Yes,Yes,Very close
4,2024/03/07 5:13:00 pm EET,Female,74.0,2nd Year,Arts & Social Sciences,52.0,Private accommodation/ stay with family/friends,R 4001- R 5000,No,3-5,Only weekends,5-8,1,3,No,Yes,Fair


##### 4.1.3.6) Renaming columns


In [41]:
# NOTE(review): the renaming below is disabled, so the DataFrame keeps its
# original column names. Writing into `columns.values` mutates the Index's
# backing array in place, which pandas does not support reliably (Index
# objects are meant to be immutable). If the renaming is re-enabled, prefer
# `alcohol_df = alcohol_df.rename(columns={old_name: new_name, ...})` or
# assigning a complete list of names to `alcohol_df.columns`.
#alcohol_df.columns.values[0] = "Timestamp"
#alcohol_df.columns.values[1] = "Sex"
#alcohol_df.columns.values[2] = "Average_GPA"
#alcohol_df.columns.values[3] = "Year_at_school_last_year"
#alcohol_df.columns.values[4] = "Faculty"
#alcohol_df.columns.values[5] = "2023_average_GPA"
#alcohol_df.columns.values[6] = "Accomodation_status"
#alcohol_df.columns.values[7] = "Allowance_2023"
#alcohol_df.columns.values[8] = "Scholarship_2023"
#alcohol_df.columns.values[9] = "Additional_studying"
#alcohol_df.columns.values[10] = "Partying_weekly"
#alcohol_df.columns.values[11] = "Number_of_drinks"
#alcohol_df.columns.values[12] = "Missed_classes_due_to_alcohol"
#alcohol_df.columns.values[13] = "Modules_failed"
#alcohol_df.columns.values[14] = "Relationship_status"
#alcohol_df.columns.values[15] = "Parental_approval"
#alcohol_df.columns.values[16] = "Strength_of_relation_with_parents"


In [43]:
# Preview the first rows again to check the column names.
alcohol_df.head()

Unnamed: 0,Timestamp,Your Sex?,Your Matric (grade 12) Average/ GPA (in %),What year were you in last year (2023) ?,What faculty does your degree fall under?,Your 2023 academic year average/GPA in % (Ignore if you are 2024 1st year student),Your Accommodation Status Last Year (2023),Monthly Allowance in 2023,Were you on scholarship/bursary in 2023?,Additional amount of studying (in hrs) per week,How often do you go out partying/socialising during the week?,"On a night out, how many alcoholic drinks do you consume?","How many classes do you miss per week due to alcohol reasons, (i.e: being hungover or too tired?)",How many modules have you failed thus far into your studies?,Are you currently in a romantic relationship?,Do your parents approve alcohol consumption?,How strong is your relationship with your parent/s?
0,2024/03/07 5:12:01 pm EET,Female,76.0,2nd Year,Arts & Social Sciences,72.0,Private accommodation/ stay with family/friends,R 4001- R 5000,No,8+,Only weekends,8+,3,0,Yes,Yes,Very close
1,2024/03/07 5:12:08 pm EET,Male,89.0,2nd Year,Economic & Management Sciences,75.0,Private accommodation/ stay with family/friends,R 7001 - R 8000,"Yes (NSFAS, etc...)",8+,Only weekends,3-5,4+,0,No,Yes,Very close
2,2024/03/07 5:12:25 pm EET,Male,76.0,1st Year,AgriSciences,55.0,Private accommodation/ stay with family/friends,R 4001- R 5000,No,3-5,2,8+,3,0,No,Yes,Very close
3,2024/03/07 5:12:28 pm EET,Male,89.0,2nd Year,Engineering,84.0,Private accommodation/ stay with family/friends,R 6001 - R 7000,No,3-5,3,8+,2,0,Yes,Yes,Very close
4,2024/03/07 5:13:00 pm EET,Female,74.0,2nd Year,Arts & Social Sciences,52.0,Private accommodation/ stay with family/friends,R 4001- R 5000,No,3-5,Only weekends,5-8,1,3,No,Yes,Fair


##### 4.1.3.7) Showing all rows with at least one missing value (NaN)

In [57]:
# Keep the rows that have a missing value in at least one column
# (any(axis=1) is True as soon as one cell in the row is NaN).
rows_with_nan = alcohol_df[alcohol_df.isna().any(axis=1)]

In [59]:
# Display the rows that contain at least one missing value.
rows_with_nan

Unnamed: 0,Timestamp,Your Sex?,Your Matric (grade 12) Average/ GPA (in %),What year were you in last year (2023) ?,What faculty does your degree fall under?,Your 2023 academic year average/GPA in % (Ignore if you are 2024 1st year student),Your Accommodation Status Last Year (2023),Monthly Allowance in 2023,Were you on scholarship/bursary in 2023?,Additional amount of studying (in hrs) per week,How often do you go out partying/socialising during the week?,"On a night out, how many alcoholic drinks do you consume?","How many classes do you miss per week due to alcohol reasons, (i.e: being hungover or too tired?)",How many modules have you failed thus far into your studies?,Are you currently in a romantic relationship?,Do your parents approve alcohol consumption?,How strong is your relationship with your parent/s?
5,2024/03/07 5:13:18 pm EET,Male,83.0,,Engineering,,Private accommodation/ stay with family/friends,R 6001 - R 7000,No,8+,4+,3-5,3,1,Yes,Yes,Very close
8,2024/03/07 5:13:28 pm EET,Male,89.0,,Engineering,,,R 5001 - R 6000,No,8+,Only weekends,3-5,0,,No,Yes,Very close
9,2024/03/07 5:13:35 pm EET,Male,83.0,,Science,,,R 6001 - R 7000,No,3-5,3,5-8,4+,0,No,Yes,Close
10,2024/03/07 5:14:04 pm EET,Female,75.0,,AgriSciences,,,R 4001- R 5000,No,5-8,2,5-8,0,0,No,Yes,Close
14,2024/03/07 5:14:35 pm EET,Male,78.0,,Science,,,,,3-5,2,5-8,0,0,No,Yes,Very close
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400,2024/03/11 7:41:43 pm EET,Male,85.0,,Arts & Social Sciences,,,R 5001 - R 6000,No,3-5,Only weekends,3-5,0,0,No,Yes,Very close
401,2024/03/12 11:05:33 am EET,Female,74.0,,Science,,Private accommodation/ stay with family/friends,,No,3-5,3,1-3,0,0,No,Yes,Close
403,2024/03/13 10:32:19 pm EET,Female,86.0,1st Year,,,Private accommodation/ stay with family/friends,R 4001- R 5000,No,5-8,2,3-5,1,0,No,Yes,Very close
404,2024/03/13 10:32:27 pm EET,Male,85.0,,Economic & Management Sciences,,Private accommodation/ stay with family/friends,R 4001- R 5000,No,1-3,4+,5-8,4+,0,No,Yes,Close


##### 4.1.3.8) Showing all rows with all NaN except in 'Timestamp' column

In [72]:
# Rows in which every column other than 'Timestamp' is missing: build the
# NaN mask without the 'Timestamp' column, then require all of it to be True.
all_na = alcohol_df[
    alcohol_df.drop(columns='Timestamp').isna().all(axis=1)
]


In [74]:
# Display the rows that are empty apart from their timestamp.
all_na

Unnamed: 0,Timestamp,Your Sex?,Your Matric (grade 12) Average/ GPA (in %),What year were you in last year (2023) ?,What faculty does your degree fall under?,Your 2023 academic year average/GPA in % (Ignore if you are 2024 1st year student),Your Accommodation Status Last Year (2023),Monthly Allowance in 2023,Were you on scholarship/bursary in 2023?,Additional amount of studying (in hrs) per week,How often do you go out partying/socialising during the week?,"On a night out, how many alcoholic drinks do you consume?","How many classes do you miss per week due to alcohol reasons, (i.e: being hungover or too tired?)",How many modules have you failed thus far into your studies?,Are you currently in a romantic relationship?,Do your parents approve alcohol consumption?,How strong is your relationship with your parent/s?
224,2024/03/07 9:24:43 pm EET,,,,,,,,,,,,,,,,
232,2024/03/07 9:55:31 pm EET,,,,,,,,,,,,,,,,


In [None]:
# Boolean Series: True for rows in which every column, 'Timestamp' included, is NaN.
alcohol_df.isna().all(axis=1)

# NOTE(review): `no_timestamp_df` is not defined anywhere in the visible
# notebook; this looks like a leftover from an earlier experiment.
#no_timestamp_df.columns