In [1]:
import pandas as pd
import numpy as np

In [25]:
# Load the dataset

print(""*60)
print("AFRICA CUP OF NATIONS MATCHES DATA ANALYSIS")
print(""*60)


AFRICA CUP OF NATIONS MATCHES DATA ANALYSIS



In [47]:
# 1. Read the CSV file
print("\n Reading the CSV file...")
df = pd.read_csv('AfricaCupofNationsMatches.csv')  # Using the provided file
print("✓ Dataset loaded successfully!")


 Reading the CSV file...
✓ Dataset loaded successfully!


In [48]:
# 2. Get the first 7 rows
print("\n First 7 rows of the dataset:")
print(""*50)
print(df.head(7))


 First 7 rows of the dataset:

   Year      Date  Time    HomeTeam       AwayTeam  HomeTeamGoals  \
0  1957  10-Feb-57   NaN     Sudan           Egypt            1.0   
1  1957  10-Feb-57   NaN  Ethiopia    South Africa            NaN   
2  1957  16-Feb-57   NaN     Egypt        Ethiopia            4.0   
3  1959  22-May-59   NaN      Egypt       Ethiopia            4.0   
4  1959  25-May-59   NaN     Sudan        Ethiopia            1.0   
5  1959  29-May-59   NaN      Egypt          Sudan            2.0   
6  1962  14-Jan-62   NaN  Ethiopia         Tunisia            4.0   

   AwayTeamGoals             Stage  \
0            2.0        Semifinals   
1            NaN        Semifinals   
2            0.0             Final   
3            0.0  Final Tournament   
4            0.0  Final Tournament   
5            1.0  Final Tournament   
6            2.0        Semifinals   

                                SpecialWinConditions                 Stadium  \
0                             

In [49]:
# 3. Select specific columns
print("\n Selecting 'HomeTeam', 'AwayTeam', 'HomeTeamGoals', 'AwayTeamGoals' columns:")
print(""*70)
selected_cols = df[['HomeTeam', 'AwayTeam', 'HomeTeamGoals', 'AwayTeamGoals']]
print(selected_cols.head(10))


 Selecting 'HomeTeam', 'AwayTeam', 'HomeTeamGoals', 'AwayTeamGoals' columns:

    HomeTeam       AwayTeam  HomeTeamGoals  AwayTeamGoals
0     Sudan           Egypt            1.0            2.0
1  Ethiopia    South Africa            NaN            NaN
2     Egypt        Ethiopia            4.0            0.0
3      Egypt       Ethiopia            4.0            0.0
4     Sudan        Ethiopia            1.0            0.0
5      Egypt          Sudan            2.0            1.0
6  Ethiopia         Tunisia            4.0            2.0
7      Egypt         Uganda            2.0            1.0
8   Tunisia          Uganda            3.0            0.0
9  Ethiopia           Egypt            4.0            2.0


In [50]:

# 4. Select rows where Egypt appears (in either HomeTeam or AwayTeam)
print("\n Rows where Egypt appears:")
print(""*40)
egypt_matches = df[(df['HomeTeam'].str.contains('Egypt', na=False)) | 
                   (df['AwayTeam'].str.contains('Egypt', na=False))]
print(f"Found {len(egypt_matches)} matches involving Egypt")
print(egypt_matches[['Date ', 'HomeTeam', 'AwayTeam', 'HomeTeamGoals', 'AwayTeamGoals']].head())


 Rows where Egypt appears:

Found 95 matches involving Egypt
       Date  HomeTeam   AwayTeam  HomeTeamGoals  AwayTeamGoals
0  10-Feb-57   Sudan       Egypt            1.0            2.0
2  16-Feb-57   Egypt    Ethiopia            4.0            0.0
3  22-May-59    Egypt   Ethiopia            4.0            0.0
5  29-May-59    Egypt      Sudan            2.0            1.0
7  18-Jan-62    Egypt     Uganda            2.0            1.0


In [51]:
# 5. Count rows and columns
print("\n Dataset dimensions:")
print(""*25)
rows, cols = df.shape
print(f"Number of rows: {rows}")
print(f"Number of columns: {cols}")


 Dataset dimensions:

Number of rows: 622
Number of columns: 12


In [52]:
# 6. Select rows where 'Attendance' is missing
print("\n Rows with missing attendance:")
print(""*35)
missing_attendance = df[df['Attendance'].isna()]
print(f"Found {len(missing_attendance)} rows with missing attendance")
print(missing_attendance[['Date ', 'HomeTeam', 'AwayTeam', 'Stadium', 'Attendance']].head())


 Rows with missing attendance:

Found 100 rows with missing attendance
        Date    HomeTeam       AwayTeam                 Stadium  Attendance
1   10-Feb-57  Ethiopia    South Africa                     NaN         NaN
8   20-Jan-62   Tunisia          Uganda  Hailé Sélassié Stadium         NaN
9   21-Jan-62  Ethiopia           Egypt  Hailé Sélassié Stadium         NaN
10  24-Nov-63     Ghana         Tunisia    Accra Sports Stadium         NaN
11  26-Nov-63     Ghana        Ethiopia    Accra Sports Stadium         NaN


In [53]:
# 7. Select rows where HomeTeamGoals are between 3 and 6 (inclusive)
print("\n Rows where HomeTeamGoals are between 3 and 6:")
print(""*50)
goals_3_to_6 = df[(df['HomeTeamGoals'] >= 3) & (df['HomeTeamGoals'] <= 6)]
print(f"Found {len(goals_3_to_6)} matches")
print(goals_3_to_6[['Date ', 'HomeTeam', 'AwayTeam', 'HomeTeamGoals', 'AwayTeamGoals']].head())


 Rows where HomeTeamGoals are between 3 and 6:

Found 138 matches
       Date    HomeTeam   AwayTeam  HomeTeamGoals  AwayTeamGoals
2  16-Feb-57     Egypt    Ethiopia            4.0            0.0
3  22-May-59      Egypt   Ethiopia            4.0            0.0
6  14-Jan-62  Ethiopia     Tunisia            4.0            2.0
8  20-Jan-62   Tunisia      Uganda            3.0            0.0
9  21-Jan-62  Ethiopia       Egypt            4.0            2.0


In [54]:
# 8. Change AwayTeamGoals in 3rd row to 10
print("\n Changing AwayTeamGoals in 3rd row to 10:")
print(""*45)
print("Before change:")
print(df.iloc[2][['HomeTeam', 'AwayTeam', 'HomeTeamGoals', 'AwayTeamGoals']])
df_modified = df.copy()  # Create a copy to avoid modifying original
df_modified.loc[2, 'AwayTeamGoals'] = 10
print("After change:")
print(df_modified.iloc[2][['HomeTeam', 'AwayTeam', 'HomeTeamGoals', 'AwayTeamGoals']])


 Changing AwayTeamGoals in 3rd row to 10:

Before change:
HomeTeam            Egypt 
AwayTeam          Ethiopia
HomeTeamGoals          4.0
AwayTeamGoals          0.0
Name: 2, dtype: object
After change:
HomeTeam            Egypt 
AwayTeam          Ethiopia
HomeTeamGoals          4.0
AwayTeamGoals         10.0
Name: 2, dtype: object


In [55]:
# 9. Sort DataFrame by HomeTeam (ascending) then by HomeTeamGoals (descending)
# Note: The requirement mentions 'HomeTeamScores' but the column is 'HomeTeamGoals'
print("\n Sorting by HomeTeam (asc) then HomeTeamGoals (desc):")
print(""*55)
df_sorted = df.sort_values(['HomeTeam', 'HomeTeamGoals'], ascending=[True, False])
print(df_sorted[['HomeTeam', 'AwayTeam', 'HomeTeamGoals', 'AwayTeamGoals']].head(10))


 Sorting by HomeTeam (asc) then HomeTeamGoals (desc):

     HomeTeam      AwayTeam  HomeTeamGoals  AwayTeamGoals
205  Algeria        Nigeria            5.0            1.0
29   Algeria         Uganda            4.0            0.0
135  Algeria         Guinea            3.0            2.0
164  Algeria         Malawi            3.0            0.0
171  Algeria          Egypt            3.0            1.0
208  Algeria    Ivory Coast            3.0            0.0
331  Algeria          Gabon            3.0            1.0
611  Algeria         Guinea            3.0            0.0
138  Algeria          Egypt            2.0            2.0
150  Algeria        Nigeria            2.0            1.0


In [56]:
# 10. Get list of column headers
print("\n List of DataFrame column headers:")
print(""*40)
column_list = df.columns.tolist()
print(column_list)




 List of DataFrame column headers:

['Year', 'Date ', 'Time ', 'HomeTeam', 'AwayTeam', 'HomeTeamGoals', 'AwayTeamGoals', 'Stage', 'SpecialWinConditions', 'Stadium', 'City', 'Attendance']


In [57]:
# 11. Append a column of choice
print("\n Adding a new column 'TotalGoals':")
print(""*40)
df_with_new_col = df.copy()
df_with_new_col['TotalGoals'] = df_with_new_col['HomeTeamGoals'].fillna(0) + df_with_new_col['AwayTeamGoals'].fillna(0)
print("New column added successfully!")
print(df_with_new_col[['HomeTeam', 'AwayTeam', 'HomeTeamGoals', 'AwayTeamGoals', 'TotalGoals']].head())



 Adding a new column 'TotalGoals':

New column added successfully!
    HomeTeam       AwayTeam  HomeTeamGoals  AwayTeamGoals  TotalGoals
0     Sudan           Egypt            1.0            2.0         3.0
1  Ethiopia    South Africa            NaN            NaN         0.0
2     Egypt        Ethiopia            4.0            0.0         4.0
3      Egypt       Ethiopia            4.0            0.0         4.0
4     Sudan        Ethiopia            1.0            0.0         1.0


In [59]:
# 12. Add 2 rows to DataFrame
print("\n Adding 2 new rows to DataFrame:")
print(""*40)
new_rows = pd.DataFrame({
    'Year': [2025, 2025],
    'Date ': ['01-Jan-25', '02-Jan-25'],
    'Time ': ['', ''],
    'HomeTeam': ['Nigeria', 'Morocco'],
    'AwayTeam': ['Algeria', 'Tunisia'],
    'HomeTeamGoals': [2, 1],
    'AwayTeamGoals': [1, 3],
    'Stage': ['Group A', 'Group B'],
    'SpecialWinConditions': ['', ''],
    'Stadium': ['Test Stadium', 'Test Stadium 2'],
    'City': ['Lagos', 'Rabat'],
    'Attendance': [50000, 45000]
})

df_with_new_rows = pd.concat([df, new_rows], ignore_index=True)
print(f"Original rows: {len(df)}")
print(f"After adding 2 rows: {len(df_with_new_rows)}")
print("Last 3 rows:")
print(df_with_new_rows.tail(3)[['HomeTeam', 'AwayTeam', 'HomeTeamGoals', 'AwayTeamGoals']])







 Adding 2 new rows to DataFrame:

Original rows: 622
After adding 2 rows: 624
Last 3 rows:
     HomeTeam  AwayTeam  HomeTeamGoals  AwayTeamGoals
621  Senegal    Algeria            0.0            1.0
622   Nigeria   Algeria            2.0            1.0
623   Morocco   Tunisia            1.0            3.0


In [60]:
# 13. Change 'Uganda' to 'China' in AwayTeam column
print("\n Changing 'Uganda' to 'China' in AwayTeam column:")
print(""*50)
df_country_changed = df.copy()
uganda_count_before = (df_country_changed['AwayTeam'] == 'Uganda').sum()
df_country_changed['AwayTeam'] = df_country_changed['AwayTeam'].replace('Uganda', 'China')
china_count_after = (df_country_changed['AwayTeam'] == 'China').sum()
print(f"Changed {uganda_count_before} instances of 'Uganda' to 'China'")
print("Sample rows with China:")
china_matches = df_country_changed[df_country_changed['AwayTeam'] == 'China']
print(china_matches[['HomeTeam', 'AwayTeam', 'HomeTeamGoals', 'AwayTeamGoals']].head())


 Changing 'Uganda' to 'China' in AwayTeam column:

Changed 0 instances of 'Uganda' to 'China'
Sample rows with China:
Empty DataFrame
Columns: [HomeTeam, AwayTeam, HomeTeamGoals, AwayTeamGoals]
Index: []


In [38]:
# 14. Reset index
print("\nResetting DataFrame index:")
print(""*35)
df_reset_index = df.copy()
df_reset_index = df_reset_index.reset_index(drop=True)
print("Index reset successfully!")
print("First few rows with new index:")
print(df_reset_index.head(3))



Resetting DataFrame index:

Index reset successfully!
First few rows with new index:
   Year      Date  Time    HomeTeam       AwayTeam  HomeTeamGoals  \
0  1957  10-Feb-57   NaN     Sudan           Egypt            1.0   
1  1957  10-Feb-57   NaN  Ethiopia    South Africa            NaN   
2  1957  16-Feb-57   NaN     Egypt        Ethiopia            4.0   

   AwayTeamGoals       Stage  \
0            2.0  Semifinals   
1            NaN  Semifinals   
2            0.0       Final   

                                SpecialWinConditions            Stadium  \
0                                                NaN  Municipal Stadium   
1  Ethiopia  wins due to disqualification of othe...                NaN   
2                                                NaN  Municipal Stadium   

        City  Attendance  
0   Khartoum     30000.0  
1        NaN         NaN  
2   Khartoum     30000.0  


In [39]:
# 15. Check if 'Stadium' column is present
print("\n Checking if 'Stadium' column exists:")
print(""*40)
stadium_exists = 'Stadium' in df.columns
print(f"'Stadium' column exists: {stadium_exists}")
if stadium_exists:
    print("Stadium column info:")
    print(f"  Non-null values: {df['Stadium'].notna().sum()}")
    print(f"  Null values: {df['Stadium'].isna().sum()}")




 Checking if 'Stadium' column exists:

'Stadium' column exists: True
Stadium column info:
  Non-null values: 618
  Null values: 4


In [40]:
# 16. Convert AwayTeamGoals datatype from int to float
print("\n. Converting AwayTeamGoals from int to float:")
print(""*50)
df_converted = df.copy()
print(f"Original dtype: {df_converted['AwayTeamGoals'].dtype}")
df_converted['AwayTeamGoals'] = df_converted['AwayTeamGoals'].astype(float)
print(f"New dtype: {df_converted['AwayTeamGoals'].dtype}")





. Converting AwayTeamGoals from int to float:

Original dtype: float64
New dtype: float64


In [41]:
# 17. Remove last 10 rows
print("\nRemoving last 10 rows from DataFrame:")
print(""*45)
df_trimmed = df.copy()
original_length = len(df_trimmed)
df_trimmed = df_trimmed.iloc[:-10]
new_length = len(df_trimmed)
print(f"Original length: {original_length}")
print(f"After removing 10 rows: {new_length}")
print(f"Rows removed: {original_length - new_length}")



Removing last 10 rows from DataFrame:

Original length: 622
After removing 10 rows: 612
Rows removed: 10


In [42]:
# 18. Iterate over rows
print("\n Iterating over first 5 rows (sample):")
print(""*45)
for index, row in df.head(5).iterrows():
    print(f"Row {index}: {row['HomeTeam']} vs {row['AwayTeam']} ({row['HomeTeamGoals']}-{row['AwayTeamGoals']})")





 Iterating over first 5 rows (sample):

Row 0: Sudan  vs  Egypt (1.0-2.0)
Row 1: Ethiopia  vs  South Africa (nan-nan)
Row 2: Egypt  vs  Ethiopia (4.0-0.0)
Row 3: Egypt vs  Ethiopia (4.0-0.0)
Row 4: Sudan  vs  Ethiopia (1.0-0.0)


In [43]:
# 19. Change column order
print("\n. Changing DataFrame column order:")
print(""*40)
# Define new column order
new_column_order = ['Year', 'Date ', 'HomeTeam', 'AwayTeam', 'HomeTeamGoals', 'AwayTeamGoals', 
                   'Stage', 'Stadium', 'City', 'Attendance', 'Time ', 'SpecialWinConditions']
df_reordered = df[new_column_order]
print("New column order:")
print(df_reordered.columns.tolist())
print("Sample data with new order:")
print(df_reordered.head(3))



. Changing DataFrame column order:

New column order:
['Year', 'Date ', 'HomeTeam', 'AwayTeam', 'HomeTeamGoals', 'AwayTeamGoals', 'Stage', 'Stadium', 'City', 'Attendance', 'Time ', 'SpecialWinConditions']
Sample data with new order:
   Year      Date    HomeTeam       AwayTeam  HomeTeamGoals  AwayTeamGoals  \
0  1957  10-Feb-57     Sudan           Egypt            1.0            2.0   
1  1957  10-Feb-57  Ethiopia    South Africa            NaN            NaN   
2  1957  16-Feb-57     Egypt        Ethiopia            4.0            0.0   

        Stage            Stadium       City  Attendance Time   \
0  Semifinals  Municipal Stadium   Khartoum     30000.0   NaN   
1  Semifinals                NaN        NaN         NaN   NaN   
2       Final  Municipal Stadium   Khartoum     30000.0   NaN   

                                SpecialWinConditions  
0                                                NaN  
1  Ethiopia  wins due to disqualification of othe...  
2                          

In [46]:
# 20. Delete rows where HomeTeamGoals is 0
print("\n Deleting rows where HomeTeamGoals is 0:")
print(""*45)
df_no_zero_goals = df.copy()
zero_goals_count = (df_no_zero_goals['HomeTeamGoals'] == 0).sum()
df_no_zero_goals = df_no_zero_goals[df_no_zero_goals['HomeTeamGoals'] != 0]
print(f"Rows with HomeTeamGoals = 0: {zero_goals_count}")
print(f"Original DataFrame length: {len(df)}")
print(f"After removing zero-goal rows: {len(df_no_zero_goals)}")
print(f"Rows deleted: {len(df) - len(df_no_zero_goals)}")

print("\n" + ""*60)
print("ANALYSIS COMPLETE!")
print(""*60)

# Additional summary statistics
print("\n BONUS: Dataset Summary Statistics")
print(""*40)
print(f"Total matches in dataset: {len(df)}")
print(f"Years covered: {df['Year'].min()} - {df['Year'].max()}")
print(f"Unique teams (Home): {df['HomeTeam'].nunique()}")
print(f"Unique teams (Away): {df['AwayTeam'].nunique()}")
print(f"Average goals per match: {(df['HomeTeamGoals'].fillna(0) + df['AwayTeamGoals'].fillna(0)).mean():.2f}")
print(f"Highest scoring match: {(df['HomeTeamGoals'].fillna(0) + df['AwayTeamGoals'].fillna(0)).max()} goals")

# Most frequent teams
print(f"\nMost active teams (as home team):")
print(df['HomeTeam'].value_counts().head(5))



 Deleting rows where HomeTeamGoals is 0:

Rows with HomeTeamGoals = 0: 128
Original DataFrame length: 622
After removing zero-goal rows: 494
Rows deleted: 128


ANALYSIS COMPLETE!


 BONUS: Dataset Summary Statistics

Total matches in dataset: 622
Years covered: 1957 - 2019
Unique teams (Home): 41
Unique teams (Away): 46
Average goals per match: 2.45
Highest scoring match: 12.0 goals

Most active teams (as home team):
HomeTeam
Ghana           58
Cameroon        55
Nigeria         54
Egypt           50
Ivory Coast     46
Name: count, dtype: int64
