# 1. Data Cleaning

In [2]:
#Importing necessary libraries
import pandas as pd
import numpy as np

In [3]:
#Loading the Shark Attack dataset
df = pd.read_csv("/Users/clark/Desktop/ironhack/shark attack/attacks.csv", encoding="latin1", sep= ",")
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/clark/Desktop/ironhack/shark attack/attacks.csv'

#### -> Renaming columns and dropping unnecessary columns

In [None]:
#Checking what columns we have
df.columns

In [None]:
#Checking how many rows we have 
df.shape

In [None]:
#Standadarzing column names
df.columns = df.columns.str.lower().str.replace(' ', '_').str.strip('_')
df.columns

---------------------------------------------------------------------------------------------
The following columns don't give us many insights for our prediction 
so we'll remove them from our data frame: 

'Investigator or Source', 'pdf', 'href formula', 'href',

'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22','Unnamed: 23'

-------------------------------------------------------------------------------------

In [None]:
#Dropping unnecessary columns
df.drop(['investigator_or_source', 'pdf', 'href_formula', 'href',
       'case_number.1', 'case_number.2', 'original_order', 'unnamed:_22',
       'unnamed:_23'], axis=1, inplace=True)
df.columns

In [None]:
df.info()

#### -> Checking for NaNs and standardizing each column if necessary

##### 1. Case Number

In [None]:
#Checking if we can drop all the rows in which case_number is null
df[df['case_number'].isnull()]

----------------------------------------------------------------------------------------------
Since the entire rows are made of NaNs when case_number is null we'll be dropping these rows as
they don't add any insights to our model
----------------------------------------------------------------------------------------------

In [None]:
#Drop the rows that all of them are Nans
df.dropna(how='all', inplace=True)
df['case_number'].isna().sum()

In [None]:
# We still have one null
df[df['case_number'].isnull()]

In [None]:
#Checking if when case_number is 0, all the other columns are also NaNs
df[df['case_number'].str.strip() == '0']

In [None]:
#Dropping all rows in which the columns are all null but in case_number
df.dropna(subset=df.columns.difference(['case_number']), how='all', inplace=True)
df[df['case_number'].str.strip() == '0']

--------------------------------------------------------------------------------------------------------
case_number is still having however weird values, such as dates and letters. 

To standardize 
this column we can replace it by the number of the row plus one (row_number + 1), so that all rows are unique

--------------------------------------------------------------------------------------------------------

In [None]:
#Replacing case_number with the row index + 1 (so the first case_number isn't 0 or NaN)
df['case_number'] = df.index + 1
df.head()

In [None]:
#Checking if case_number is now an int instead of object
df['case_number'].dtype

##### 2. Date

In [None]:
#Funtion that turns all values in date into actual dates
from dateutil import parser

def convert_to_datetime(df, column_name):
    def parse_date(row):
        try:
            # Try to parse the date using dateutil.parser
            parsed_date = parser.parse(row[column_name], dayfirst=True)
            
            # Check if the year is before 1677 
            #(The default range for pandas datetime objects is from 1677-09-21 00:12:43.145225 
            #to 2262-04-11 23:47:16.854775.),otherwise it returns out of bounds error
            if parsed_date.year < 1677:
                # If before 1677, return '01-01-1677'
                return '01-01-1678'
            
            # Extract only the date part from the parsed datetime object
            date_only = parsed_date.strftime('%d-%m-%Y')
            
            return date_only
        
        except Exception:
            # If there is an error in parsing, return a default value
            return '01-01-1900'  # Adjust the default value as per your requirement
    
    df[column_name] = df.apply(parse_date, axis=1)
    return df

In [None]:
#Converting all dates into actual dates using the previous function
df = convert_to_datetime(df, 'date')

In [None]:
#Checking row 4644 because it returns an error in the next cell due to this date
print(df.iloc[4644])
#Replacing the date for this specif row it its correct year
df.loc[4644, 'date'] = '22-07-1944'
print('----------------------')
print(df.iloc[4644])

In [None]:
#Converting column date to datetime
df['date'] = pd.to_datetime(df['date'], format='%d-%m-%Y')
print(df['date'].dtype)
df.head()

##### 3. Year

In [None]:
# Replace non-finite values with a specific value (e.g., 1900) and converting column year from float to int
df['year'] = df['year'].replace([np.nan, 0], 1900)
df['year'] = df['year'].astype(int)
df['year'].dtype

df['year'].value_counts()

##### 4. Type

In [None]:
#Function that replaces the type with 'Sea Disaster' for types with the word 'Boat'
def replace_type(df, column_name):
    # Replace NaN values with 'Invalid'
    df[column_name] = df[column_name].fillna('Invalid')
    
    # Use string methods to check if 'boat' is present in 'type' column (ignore case)
    condition = df[column_name].str.contains('boat', case=False)
    
    # Replace values that meet the condition with 'Sea Disaster'
    df.loc[condition, column_name] = 'Questionable'
    
    return df

In [None]:
#Standadizing the types to have only: 'Unprovoked', 'Provoked', 'Invalid', 'Sea Disaster', 'Questionable'
df = replace_type(df, 'type')
df['type'].value_counts()

##### 5. Country

In [None]:
#Making sure all countries are set with upper case
df['country'] = df['country'].str.upper().str.replace(r'\W', ' ', regex=True).str.strip(' ')

In [None]:
# Set the display options to show all values without truncation
pd.set_option('display.max_rows', None)
df['country'].value_counts(dropna=False).sort_index()

--------------------------------------------------------------------------------------------------
Because we have some invalid countries (eg. 'BETWEEN PORTUGAL & INDIA', 'DIEGO GARCIA', ) and a few NaN
we'll check if these countris are on the all_countries pandas library and if not mark as 'UNKNOWN'

---------------------------------------------------------------------------------------------------

In [None]:
#Installing and importing geonamescache library to have a list of all countries 
#to compare with our column country
!pip install geonamescache
import geonamescache

gc = geonamescache.GeonamesCache()
all_countries = list(gc.get_countries_by_names().keys())

# Add 'Djibouti' to the list
all_countries.append('Djibouti')

#Turning all countries upper case so we can compare later with our country column
all_countries = [country.upper() for country in all_countries]
print(all_countries)

--------------------------------------------------------------------------------------------------------
The bellow function iterates over each row of the DataFrame using iterrows(). 

-> It checks if the value in the "country" column exists in the all_countries list:

    1. If it doesn't, it checks if the value in the "area" column contains a country that exists in the all_countries list or if country contains a country in the all_countries list. 

    2. If it does, it replaces the value in the "country" column with that extracted_country name using df.at[index, 'country'] = extracted_country.

    3. If the extracted_country is not found in all_countries, it replaces the country with the string 'UNKNOWN' using df.at[index, 'country'] = 'UNKNOWN'.

----------------------------------------------------------------------------------------------------------

In [None]:
#Importing regex
import re

#Function to replaces the country name 
#If a value doesn't exist, it will check if the country name appears in the "area" column. 
#If it does, it will replace the value with the country name.
def replace_country(df, country_column, area_column, all_countries):
    # Replace NaN values with 'Unknown'
    df[country_column] = df[country_column].fillna('UNKNOWN')
    
    #Iterating over each row of the DataFrame using iterrows()
    for index, row in df.iterrows():
        country = str(row[country_column])  # Ensure the value is a string
        area = str(row[area_column])  # Ensure the value is a string
        area = area.upper() #Ensuring it is in upper case like all_countries
        
        #Check if Country is "OKINAWA" or "CEYLON" replace with "JAPAN" and "SRI LANKA", respectively
        if country == 'OKINAWA':
            country = 'JAPAN'
        elif country == 'CEYLON':
            country = 'SRI LANKA'
        elif country == 'USA':
            country = 'UNITED STATES'
        
        # Check if any country name from all_countries exists in the area/country and extracts the country
        pattern = r"\b(" + "|".join(all_countries) + r")\b"
        match_country = re.search(pattern, country, flags=re.IGNORECASE)
        match_area = re.search(pattern, area, flags=re.IGNORECASE)
        
        if match_country:
            extracted_country = match_country.group(0)
        else:
            if match_area:
                extracted_country = match_area.group(0)
            else:
                extracted_country = None
        
        #Checking if the value in the "country" column exists in the all_countries list
        if country in all_countries:
            df.at[index, country_column] = country
        #If it doesn't, it checks if the value in the "area" column exists in the all_countries list
        else:
            if extracted_country in all_countries: 
                df.at[index, country_column] = extracted_country
            #Otherwise it replaces the country with the string 'UNKNOWN'
            else:
                df.at[index, country_column] = 'UNKNOWN'
    
    return df

In [None]:
#Replacing the countries with valid values using the previous function
df = replace_country(df,'country', 'area', all_countries)
pd.set_option('display.max_rows', None)
df['country'].value_counts(dropna=False).sort_index()

##### 6. Area

In [None]:
#Making sure all areas are set with upper case
df['area'] = df['area'].str.upper().str.replace(r'\W', ' ', regex=True).str.strip(' ')

# Set the display options to show all values without truncation
df['area'].value_counts(dropna=False).sort_index()

--------------------------------------------------------------------------------------------------------
Since we're having a lot of distinct values, we'll check:

    1. if the area cointains an existing region in the pycountry library and replace area with {region_name}, {country}. 

    2. if not we'll check if it has the word "North", "Central", "South", "East", "West" and change area for {North, Central, South, East, West},{COUNTRY}
--------------------------------------------------------------------------------------------------------

In [None]:
#Installing it
!pip install pycountry

#Importing a Python package that provides a comprehensive collection of country-related data. 
import pycountry

#Function that returns a list of regions for a given country
def get_cities_by_country(country_name):
    country_code = None
    subdivisions_list = []  # Initialize an empty list to store the subdivision names
    
    for country in pycountry.countries:
        if country.name == country_name:
            country_code = country.alpha_2
            break
    
    if country_code:
        for subdivision in pycountry.subdivisions.get(country_code=country_code):
            subdivisions_list.append(subdivision.name)  # Add subdivision name to the list
        
        return subdivisions_list  # Return the list of subdivision names
    else:
        return ''  # Return empty list if country not found

In [None]:
#Function that replaces the area with a valid region, 
#or area (North, Central, South, East, West) and country_name
def replace_area(df, country_column, area_column):
    #List of regions
    regions = []
    
    #Iterating over each row of the DataFrame using iterrows()
    for index, row in df.iterrows():
        # Ensure the value is a string and turns 'SRI LANKA' into 'Sri Lanka', for example
        country = str(row[country_column]).title()  
        # Ensure the value is a string
        area = str(row[area_column]).title()  
        
        regions = [city for city in get_cities_by_country(country)]
        
        extracted_region = None
        
        #In case there's a list of regions we extract the area
        if regions != []:
            # Check if area cointains a region in the list
            pattern = r"\b(" + "|".join(map(re.escape, regions)) + r")\b"
            match_area = re.search(pattern, area, flags=re.IGNORECASE)

            if match_area:
                extracted_region = match_area.group(0) + ', ' + country
            #If there are no matches in regions list, then we check if words in area are within a region
            #For example 'Veracruz' should match 'Veracruz de Ignacio de la Llave'
            else:
                pattern = r"\b" + re.escape(area) + r"\b"

                for region in regions:
                    if re.search(pattern, region, flags=re.IGNORECASE):
                        extracted_region = region + ', ' + country
                        break
            
            #In case there were no matches at all we check if there's key words like 
            #'north', 'south', 'west', 'east', 'central'
            if extracted_region == None:
                pattern = r"(north|south|west|east|central)"
                match_area = re.search(pattern, area, flags=re.IGNORECASE)

                if match_area:
                    extracted_region = match_area.group(0) + ', ' + country
    
        #Replacing the area with the extracted_region in case there was a match
        if extracted_region != None:
            df.at[index, area_column] = extracted_region
        else:
            df.at[index, area_column] = 'UNKNOWN'

    return df

In [None]:
#Replacing the areas with valid values using the previous function
df = replace_area(df,'country', 'area')
pd.set_option('display.max_rows', None)
df['area'].value_counts(dropna=False).sort_index()

##### 7. Location

-------------------------------------------------------------------------------------------------------
Location might not be a relevant column for us, so we'll drop eventually possibly

-----------------------------------------------------------------------------------------------------

##### 8. Activity

----------------------------------------------------------------------------------------------------------
For column 'activity' we'll divide it into categories: 

-> swimming: including any injury containing words 'swimming', 'swimm', 'bathing', 'dangling', 'floating'

-> diving: including words such as 'dive', 'diving', 'scuba', 'dived''

-> fishing: 'chase', 'catch','catching', 'attract', 'attracting','fishing', 'fish', 'retrieve', 'net', 'collecting', 'crabbing', 'crayfishing', 'feeding', 'rescuing', 'rescue','trap

-> water sport: 'board', 'canoe', 'surf', 'surfing', 'boarding', 'canoeing', 'paddle', 'paddling'

-> sailing: 'boat', 'sailing', 'sail', 'ship', 'sailboat', 'adrift', 'adrifting', 'conducting', 'cruise', 'cruising', 'anchor', 'escape', 'escaping', 'yacht','yachting'

-> air disaster: 'air', 'aircraft', 'crashed'

-> unknown: For anything else

---------------------------------------------------------------------------------------------------------

In [None]:
#Function that replaces the activity with its respective category
def replace_activity(df, activity_column):
    for index, row in df.iterrows():
        activity = str(row[activity_column]).lower()  # Convert to lowercase for case insensitivity
        
        
        # Check for specific keywords/categories using string matching
        if any(keyword in activity for keyword in ['swimming', 'swimm', 'bathing', 'dangling', 'floating']):
            df.at[index, activity_column] = 'swimming'

        elif any(keyword in activity for keyword in ['dive', 'diving', 'scuba', 'dived']):
            df.at[index, activity_column] = 'diving' 

        elif any(keyword in activity for keyword in ['chase', 'catch','catching', 'attract', 
                                                     'attracting','fishing', 'fish', 'retrieve', 'net', 
                                                     'collecting', 'crabbing', 'crayfishing', 'feeding', 
                                                     'rescuing', 'rescue','trap']):
            df.at[index, activity_column] = 'fishing'
        
        elif any(keyword in activity for keyword in ['board', 'canoe', 'surf', 'surfing', 'boarding', 
                                                     'canoeing', 'paddle', 'paddling']):
            df.at[index, activity_column] = 'water sport'
        
        elif any(keyword in activity for keyword in ['boat', 'sailing', 'sail', 'ship', 'sailboat', 
                                                     'adrift', 'adrifting', 'conducting', 'cruise', 
                                                     'cruising', 'anchor', 'escape', 'escaping', 'yacht',
                                                     'yachting']):
            df.at[index, activity_column] = 'sailing'
            
        elif any(keyword in activity for keyword in ['air', 'aircraft', 'crashed']):
            df.at[index, activity_column] = 'air disaster'

        else:
            df.at[index, activity_column] = 'unknown'
            
    return df

In [None]:
#Replacing the activities using the above function
df = replace_activity(df, 'activity')
df['activity'].value_counts(dropna=False).sort_index()

##### 9. Name

In [None]:
name = df['name'].value_counts()

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(name)

------------------------------------------------------------------------------------------------------
Like Location name might not be very relevant for our model, therefore we'll probably drop it too 

------------------------------------------------------------------------------------------------------

##### 10. Sex

In [None]:
#Replacing 'sex' column with 'm', 'f' and 'o'
df['sex'] = df['sex'].replace({'M': 'm', 'F': 'f', 'N': 'o', 'lli': 'o', '.': 'o', 'M ': 'm'})
df['sex'].value_counts()

##### 11. Age

----------------------------------------------------------------------------------------------------------
For age we'll divide all ges into categories such as:

-> child: 0-12

-> teenage: 13-17

-> young Adult: 18-30

-> adult: 30-49

-> middle-age: 50-64

-> elderly: 65+

-----------------------------------------------------------------------------------------------------------

In [None]:
#Function that replaces the ages whith its age category
def replace_age(age):
    if pd.isnull(age):  # Check for missing values
        return 'unknown'
    age = age.lower()  # Convert to lowercase for case insensitivity
    
    # Check for specific patterns using regular expressions
    if re.search(r'\d+', age):  # Check for any digit in the string
        age_number = int(re.findall(r'\d+', age)[0])  # Extract the first digit as the age number
        
        if re.search(r'mid-\d+s', age):  # Check for the 'mid-30s' or 'mid-20s' pattern
            age_number = age_number + 5  # Extract the number from the pattern and 5 more years
            if age_number >= 20 and age_number <= 30:
                return 'young-adult'
            elif age_number >= 30 and age_number <= 49:
                return 'adult'
            else:
                return 'unknown'
        
        elif age_number >= 0 and age_number <= 12:
            return 'child'
        elif age_number >= 13 and age_number <= 17:
            return 'teenager'
        elif age_number >= 18 and age_number <= 30:
            return 'young-adult'
        elif age_number >30 and age_number <= 49:
            return 'adult'
        elif age_number >= 50 and age_number <= 64:
            return 'middle-age'
        elif age_number >= 65:
            return 'elderly'
        else:
            return 'unknown'
    
    elif re.search(r'child', age):  # Check for the presence of 'child' in the string
        return 'child'
    
    elif re.search(r'teen', age):  # Check for the presence of 'teen' in the string
        return 'teenager'
    
    elif age.startswith('young'):
        return 'young-adult'
    
    elif re.search(r'adult', age):  # Check for the (adult)' pattern
        return 'adult'
    
    elif age.startswith('middle'):
        return 'middle-age'
    
    else:
        return 'unknown'
    

In [None]:
#Applying replace_age funtion to the 'age' column
df['age'] = df['age'].apply(replace_age)

# Print the updated DataFrame
df['age'].value_counts(dropna=False).sort_index()

##### 12. Injury

------------------------------------------------------------------------------------------------------
For column injury we'll also divide it into categories: 

-> injured: including any injury containing words 'injured', 'laceration', 'bite', 'wound', 'gash',
    'scratch', 'cut','bitten', 'mauled', 'teeth', 'recovered','serious', 'tooth', 'puncture', 'severed'

-> dead: including words such as 'perish', 'dead', 'death', 'body', 'bodies', 'lost', 'remains'

-> no injury: 'hoax', 'no injury', 'survived', 'survive'

-> unknown: For anything else

----------------------------------------------------------------------------------------------------------

In [None]:
#Fuction that replaces the injury with a set category given a keyword 
def replace_injury(df, injury_column):
    for index, row in df.iterrows():
        injury = str(row[injury_column]).lower()  # Convert to lowercase for case insensitivity

        # Check for specific keywords/categories using string matching
        if any(keyword in injury for keyword in ['injured', 'laceration', 'lacerated','bite', 'wound', 'gash', 'scratch', 
                                                   'injuries', 'injury','cut', 'bitten', 'mauled', 'teeth', 'recovered', 'serious', 
                                                   'tooth', 'puncture', 'severed']):
            if any(keyword in injury for keyword in ['no injury']):
                df.at[index, injury_column] = 'no injury'
            elif any(keyword in injury for keyword in ['perish', 'perished', 'dead', 'death', 'body', 'bodies', 'lost', 'remains', 'died']):
                df.at[index, injury_column] = 'dead'
            else:
                df.at[index, injury_column] = 'injured'

        elif any(keyword in injury for keyword in ['hoax', 'no injury','survived', 'survive']):
            if any(keyword in injury for keyword in ['perish', 'perished','dead', 'death', 'body', 'bodies', 'lost', 'remains', 'died']):
                df.at[index, injury_column] = 'dead'
            else:
                df.at[index, injury_column] = 'no injury'

        elif any(keyword in injury for keyword in ['perish', 'dead', 'death', 'body', 'bodies', 'lost', 'remains', 'died']):
            df.at[index, injury_column] = 'dead'

        else:
            df.at[index, injury_column] = 'unknown'
            
    return df

In [None]:
#Applying the previous function to the 'injury' column
df = replace_injury(df, 'injury')
df['injury'].value_counts()

##### 13. Fatal

In [None]:
#Renaming column fatalas we are adding 'u' for unknown
df = df.rename(columns={"fatal_(y/n)": "fatal(y/n/u)"})

In [None]:
df['fatal(y/n/u)'].value_counts()

In [None]:
#Replacing values with a valid value
df['fatal(y/n/u)'] = df['fatal(y/n/u)'].replace({'UNKNOWN': 'u', 'M': 'u', ' N': 'n', '2017': 'u', 'N ': 'n', 'Y': 'y', 'N': 'n'})

In [None]:
df['fatal(y/n/u)'].unique()

In [None]:
#Filling the NaNs with 'u'
df.fillna('u', inplace=True)

In [None]:
df['fatal(y/n/u)'].value_counts()

##### 14. Time

-------------------------------------------------------------------------------------
For column time, we'll split into the following categories:

Morning, Midday, Afternoon, Evening and Night

------------------------------------------------------------------------------------------------

In [None]:
#Fuction that replaces the time with its category
def standarize_time(value):
    if pd.isna(value) or value == '--' or value == '':
        return 'Unknown'
    elif 'h' in value:
        try:
            hour = int(value.split('h')[0])
            if 6 <= hour < 10:
                return 'Morning'
            elif 10 <= hour < 14:
                return 'Midday'
            elif 14 <= hour < 17:
                return 'Afternoon'
            elif 17 <= hour < 21:
                return 'Evening'
            else:
                return 'Night'
        except ValueError:
            return 'Unknown'
    elif 'Morning' in value or 'AM' in value:
        return 'Morning'
    elif 'Midday' in value:
        return 'Midday'
    elif 'Afternoon' in value or 'PM' in value:
        return 'Afternoon'
    elif 'Night' in value or 'Evening' in value:
        return 'Night'
    else:
        return 'Unknown'

In [None]:
#Applying the previous function to time
df['time'] = df['time'].apply(standarize_time)

In [None]:
df['time'].value_counts()

##### 15. Species

In [None]:
#Replacing the NaN with 'unknown'
df['species'].fillna("unknown", inplace=True)
df.loc[~df['species'].str.contains('shark', case=False), 'species'] = 'unknown'

-------------------------------------------------------------------------------------------------------
The Function bellow checks:

    1. if the value contains the word 'shark'
    
        1.1 if it does then we fetch the word before shark and shark and save it as the species
        1.2 if there's no other word before shark then we mark as 'unknown'
        1.3 if the string value doesn't contain the word 'shark' then we mark as 'unknown'
        
--------------------------------------------------------------------------------------------------------

In [None]:
#Fuction that fetches the species from the strings that compose column species
def replace_species(df, species_column):
    for index, row in df.iterrows():
        species = str(row[species_column]).lower() # Convert to lowercase for case insensitivity
        
        #Clearing numbers from the string
        species = re.sub(r'\d+', '', species)
    
        if 'shark' in species:
            # Retrieve the word before 'shark' and the word 'shark' itself
            excluded_words = ['small', 'large', 'big', 'another', 'from', 'foot', 
                              'same', 'two', 'young', 'old', 'female', 'male']
            pattern = r'(\b(?!(?:{}|\d+)\b)\w{{3,}}\s+shark\b)'.format('|'.join(excluded_words))
            match = re.search(pattern, species, re.IGNORECASE)
            if match:
                df.at[index, species_column] = match.group(1)
            else:
                df.at[index, species_column] = 'unknown'
        else:
            # Replace species without the word 'shark' with 'unknown'
            df.at[index, species_column] = 'unknown'
        
    
    return df

In [None]:
#Applying the above function to replace the species column with valid values
df = replace_species(df, 'species')

df['species'].value_counts()

-------------------------------------------------------------------------------------------------------
Before doing the logistic regression and data analysis we are going to do some filtering for the countries and species:

    1. Countries: we are going to filter it by the top 10 countries that had the most shark attacks
    2. Species: we are going to filter out the species that only have 1 count
        
--------------------------------------------------------------------------------------------------------

In [None]:
df1 = df.copy()

**Countries**

Creating a filter for countries

In [None]:
df1['country'].value_counts()

In [None]:
#The top 10 countries are the following: United States, Australia, South Africa, Papua New Guinea, New Zealand, Brazil, Bahamas, Mexico, Italy and Unknown

top_10 = ['UNITED STATES', 'AUSTRALIA', 'SOUTH AFRICA', 'UNKNOWN', 'PAPUA NEW GUINEA', 'NEW ZEALAND', 'BRAZIL', 'BAHAMAS', 'MEXICO', 'ITALY']
df1_top_10 = df1[df1['country'].isin(top_10)]

**Species**

Creating a filter for species

In [None]:
#We are going to filter out all the type of species that the count is under 5

species_count = df1['species'].value_counts()
species_attacks = species_count[species_count > 4].index
df1_species_attacks = df1[df1['species'].isin(species_attacks)]

**Combine both**

In [None]:
df2 = df1.copy()

In [None]:
df2 = df1_top_10[df1_top_10['species'].isin(species_attacks)]

In [None]:
df2['country'].value_counts()

In [None]:
df2['species'].value_counts()

In [None]:
#Dropping columns location and name as they wont't be relevant for our model
df_filtered = df2.drop(['case_number','location','name', 'date'], axis=1, inplace=True)

df_filtered