# Importing modules, loading file

In [1]:
import pandas as pd

In [2]:
# Show every column when a DataFrame is displayed instead of truncating.
pd.set_option('display.max_columns', None)

# Spreadsheet in the legacy .xls format; pandas needs the optional xlrd
# package (>= 2.0.1) to read it.
url = 'https://www.sharkattackfile.net/spreadsheets/GSAF5.xls'
df = pd.read_excel(url)

ImportError: Missing optional dependency 'xlrd'. Install xlrd >= 2.0.1 for xls Excel support Use pip or conda to install xlrd.

In [None]:
# Display the DataFrame loaded from the spreadsheet.
df

# Data Cleaning

In [None]:
# 1. Lose PDF, source, location (and more) columns✅
# 2. Handle all the null✅
    # Year -> filling missing/value=0 with data from "Date" column
    # Year -> keeping only data from the last 77 years (75quantile)
    # Time and age replace by avg✅
# Delete rows with a majority of nulls✅
# 4.Check for duplicates✅
# 3.Format Data
# Creating new column with "Month"


In [None]:
# 1. Drop the PDF, source, location (and other) columns that are not used.

columns_to_drop = [
    'Location', 'pdf', 'Source', 'href formula', 'Unnamed: 11', 'href',
    'Case Number', 'Case Number.1', 'original order', 'Unnamed: 21',
    'Unnamed: 22',
]
df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# "Year"
# Fill missing "Year" values (NaN and 0) with the 4-digit year found in "Date".

print(f"Starting this process there were {sum(df['Year'] == 0.0)} values of 'Year' = 0.\n"
         f"And 'Year' had {df['Year'].isnull().sum()} NaN values.")

print(df['Year'].describe())

# Some rows have Year == 0 (or NaN) although "Date" still contains the year.
df["Date"] = df["Date"].apply(str)  # cast to str so the .str accessor works on every row

# Rows to discard:
#   * B.C. records. The dots are escaped because str.contains treats the
#     pattern as a regex, in which a bare "." matches any character.
#   * rows with neither a year nor a date (Year == 0 and Date == "No date").
is_bc = df["Date"].str.contains(r"BC|B\.C\.")
has_no_date_info = (df["Date"] == "No date") & (df["Year"] == 0.0)
# .copy() makes the result an independent frame, so the .loc assignment below
# does not write into a slice of the previous one.
df = df[~(is_bc | has_no_date_info)].copy()

# Wherever "Year" is 0 or NaN, take the first run of 4 consecutive digits in
# "Date" as the year. The extracted text is converted to a number so the
# column stays numeric instead of becoming a mix of floats and strings.
needs_year = df['Year'].isna() | (df['Year'] == 0.0)
year_from_date = df.loc[needs_year, 'Date'].str.extract(r'(\d{4})', expand=False)
df.loc[needs_year, 'Year'] = pd.to_numeric(year_from_date, errors='coerce')

print(f"Finishing this process there are {sum(df['Year'] == 0.0)} values of 'Year' = 0.\n"
        f"And 'Year' has {df['Year'].isnull().sum()} NaN values.\n"
        "The missing values were filled in from info from the 'Date' column.")



In [None]:
# "Year"
# Keep only the data from the last 77 years (75quantile).

print(f"Starting this process the dataframe has {len(df)} rows.")

# Rows that still have no year are removed first
# (original note: 3 rows, which also look to be from around the world wars).
df.dropna(subset=["Year"], inplace=True)

df['Year'] = df['Year'].astype(int)

is_recent = df['Year'] >= 1947
df = df[is_recent]

print(f"After this process the dataframe has {len(df)} rows.")

In [None]:
# 2. Handle all the nulls

# Drop rows in which every column is null.
df.dropna(how='all', inplace=True)

# Convert 'Age' to a numeric type (entries that are not numbers become NaN),
# then replace the nulls with the average age.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df['Age'] acts on an intermediate object and is not guaranteed to update df.
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
df['Age'] = df['Age'].fillna(df['Age'].mean())

# Same treatment for 'Time'.
# NOTE(review): errors='coerce' turns every non-numeric entry into NaN; if
# 'Time' is stored as text (clock strings, "Afternoon", ...), most values end
# up replaced by the mean -- confirm this is intended.
df['Time'] = pd.to_numeric(df['Time'], errors='coerce')
df['Time'] = df['Time'].fillna(df['Time'].mean())

# Rows missing 'Country', 'Type' or 'Injury' are dropped because there are few
# of them (original note: 50, 16 and 25 NaN respectively).
df.dropna(subset=['Country', 'Type', 'Injury'], inplace=True)

In [None]:
# Number of null values left in each column.
df.isna().sum()

In [None]:
# Map every column name to the array of distinct values it contains.
unique_values_per_column = {name: df[name].unique() for name in df.columns}

unique_values_per_column

In [None]:
# Frequency of each distinct value in the 'Sex' column.
df['Sex'].value_counts()

In [None]:
# Keep only the rows whose 'Sex' value is 'M' or 'F'; rows with any other
# value (including nulls) are removed from the DataFrame.
rows_with_other_sex = df.index[~df['Sex'].isin(['M', 'F'])]
df.drop(index=rows_with_other_sex, inplace=True)

In [None]:
# Replace all null values in the 'State' column with the word 'undefined':
# the state is not needed for the research, and the other values of the same
# row can still be used.
# Assigned back to the column, because fillna(inplace=True) on df['State']
# acts on an intermediate object and is not guaranteed to update df.
df['State'] = df['State'].fillna('undefined')

In [None]:
# Replace all null values of the species with 'unknown': dropping those rows
# would lose a significant part of the sample, which could compromise the
# credibility of the research.
# Assigned back to the column, because fillna(inplace=True) on df['Species ']
# acts on an intermediate object and is not guaranteed to update df.
df['Species '] = df['Species '].fillna('unknown')

In [None]:
# Replace null activities with 'unknown'. Assigned back to the column, because
# fillna(inplace=True) on df['Activity'] acts on an intermediate object and is
# not guaranteed to update df.
df['Activity'] = df['Activity'].fillna('unknown')

In [None]:
# NaN values in 'Name'

def generate_anonymous_label(counter):
    """Return the placeholder name for the ``counter``-th unnamed record."""
    return f'anonymous{counter}'

# Give every missing name its own label: anonymous1, anonymous2, ...
# fillna() cannot do this with a lambda (it does not call it once per row),
# and a counter incremented a single time after the call never numbers the
# rows, so the labels are built explicitly for the rows that need one.
missing_name = df['Name'].isna()
missing_count = int(missing_name.sum())
if missing_count:
    df.loc[missing_name, 'Name'] = [
        generate_anonymous_label(number)
        for number in range(1, missing_count + 1)
    ]

# Next unused label number.
counter = missing_count + 1

# People's names are not needed for the research, so the missing ones are
# replaced by placeholders and the information in the other columns of those
# rows can still be used.

In [None]:
# Date -> new column Month

# Start from the raw date text. Season names and a few one-off date strings
# are first mapped to a month name (a season becomes its middle month).
season_and_special_dates = {"Fall":"October","Summer":"July","summer":"July","Nox":"November","Winter":"January","2017.06.05":"June","2008.01.30":"January","02-Ap-2001":"April"}
month_text = df['Date'].replace(season_and_special_dates, regex=True)

# Strip words that carry no month information and would otherwise be picked
# up by the 3-letter pattern below.
substring = ["Early","Reported","Before","Mid","Between","date","and","Late"]
for filler in substring:
    month_text = month_text.str.replace(filler, '')

# The first run of 3 letters is taken as the month.
df['Month'] = month_text.str.extract(r'([A-Za-z]{3})')

# Original note: the NaN values at this point are mostly dates that had only
# a year.

# 3-letter fragments that were checked and are not months become missing values.
not_a_month = {"sam":None,"Las":None,"Cir":None,"Pri":None,"Aft":None,"lat":None}
df['Month'] = df['Month'].replace(not_a_month, regex=True)

# Original note: July and August are the most active months.

# Rows without a month are dropped (original note: around 100+).
df.dropna(subset=["Month"], inplace=True)

print(f"There are {df['Month'].isnull().sum()} NaN values in 'Month'.")

In [None]:
# Report how many fully duplicated rows exist (original note: none), then
# remove any that are present.
duplicate_rows = df.duplicated()
print(duplicate_rows.sum())
df.drop_duplicates(inplace=True)

# Data Formatting

In [None]:
#Formatting Data:

#Use round() and format() to format numeric values.
#Use f-strings, format() or % to format strings and use string methods like lower(), upper(), title(), strip(), split(), and replace().
#Cleaning Column Names:

#Use df.columns to access column names.
#Modify column names using df.columns or rename()

In [None]:
# DataFrame.applymap is deprecated in favour of DataFrame.map (pandas 2.1+);
# use whichever element-wise method this pandas version provides.
elementwise = df.map if hasattr(df, "map") else df.applymap

# Lower-cased and capitalized copies of the data. Only string cells are
# changed; df itself is not modified (the results are new variables).
lowercase_df = elementwise(lambda x: x.lower() if isinstance(x, str) else x)
capitalized_df = elementwise(lambda x: x.capitalize() if isinstance(x, str) else x)
capitalized_df

In [None]:
# Number of rows left after cleaning.
len(df)

In [None]:
# Final check: null values per column.
df.isna().sum()