In [43]:
pip install pandas


Note: you may need to restart the kernel to use updated packages.


In [44]:
pip install pyspark

Note: you may need to restart the kernel to use updated packages.


Here we install the dependancies that are pandas and sqlite3.   

Then we load the two csv files using the pandas function 'read_csv' and store them into two variables 
"deliveries" and "matches".

In [45]:
import pandas as pd
import sqlite3

deliveries = pd.read_csv('deliveries.csv')
matches = pd.read_csv('matches.csv')

Here we are trying to connect to the SQLite database "cricket_data", and we store it in the variable "connect_db". In order to do that, we first use sqlite3's 'connect' function, that will check whether a file named "cricket_data.db" exists, if it's the case then it will connect to it; else it will create a new database file named like that. 

Then we create a cursor object in order to execute SQL commands to the database.

In [46]:
connect_db = sqlite3.connect('cricket_data.db')
cursor = connect_db.cursor()

In [47]:
deliveries.to_sql('deliveries', connect_db, if_exists='replace', index=False)
matches.to_sql('matches',connect_db,if_exists='replace',index=False)

636

In [48]:
connect_db.commit()
# connect_db.close()

In [49]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when

# Créer une session Spark
spark = SparkSession.builder \
    .appName("Check NULLs in CSV files") \
    .getOrCreate()

# Charger les fichiers CSV dans des DataFrames Spark
deliveries_df = spark.read.csv('deliveries.csv', header=True, inferSchema=True)
matches_df = spark.read.csv('matches.csv', header=True, inferSchema=True)


# Fonction pour compter les valeurs NULL dans chaque colonne d'un DataFrame
def count_nulls(df):
    null_counts = {}
    for column in df.columns:
        # Compter les valeurs NULL
        null_count = df.filter(col(column).isNull()).count()
        
        # Stocker les résultats
        null_counts[column] = null_count
    return null_counts

# Compter les valeurs NULL pour le DataFrame deliveries
deliveries_null_counts = count_nulls(deliveries_df)

# Compter les valeurs NULL pour le DataFrame matches
matches_null_counts = count_nulls(matches_df)

# Afficher les résultats pour le fichier deliveries.csv
print("Résultats pour 'deliveries.csv':")
for column, count in deliveries_null_counts.items():
    print(f"Colonne '{column}': {count} NULL")

# Afficher les résultats pour le fichier matches.csv
print("\nRésultats pour 'matches.csv':")
for column, count in matches_null_counts.items():
    print(f"Colonne '{column}': {count} NULL")

# Fermer la session Spark
spark.stop()

Résultats pour 'deliveries.csv':
Colonne 'match_id': 0 NULL
Colonne 'inning': 0 NULL
Colonne 'batting_team': 0 NULL
Colonne 'bowling_team': 0 NULL
Colonne 'over': 0 NULL
Colonne 'ball': 0 NULL
Colonne 'batsman': 0 NULL
Colonne 'non_striker': 0 NULL
Colonne 'bowler': 0 NULL
Colonne 'is_super_over': 0 NULL
Colonne 'wide_runs': 0 NULL
Colonne 'bye_runs': 0 NULL
Colonne 'legbye_runs': 0 NULL
Colonne 'noball_runs': 0 NULL
Colonne 'penalty_runs': 0 NULL
Colonne 'batsman_runs': 0 NULL
Colonne 'extra_runs': 0 NULL
Colonne 'total_runs': 0 NULL
Colonne 'player_dismissed': 143022 NULL
Colonne 'dismissal_kind': 143022 NULL
Colonne 'fielder': 145091 NULL

Résultats pour 'matches.csv':
Colonne 'id': 0 NULL
Colonne 'season': 0 NULL
Colonne 'city': 7 NULL
Colonne 'date': 0 NULL
Colonne 'team1': 0 NULL
Colonne 'team2': 0 NULL
Colonne 'toss_winner': 0 NULL
Colonne 'toss_decision': 0 NULL
Colonne 'result': 0 NULL
Colonne 'dl_applied': 0 NULL
Colonne 'winner': 3 NULL
Colonne 'win_by_runs': 0 NULL
Colonne 

In [51]:
# Dropping empty umpire3 and adding Unknown to empty others
spark = SparkSession.builder \
    .appName("Replace with unknown or delete") \
    .getOrCreate()

#Delete or change to unknown
matches_df = matches_df.withColumn('umpire1', when(col('umpire1').isNull(), lit("Unknown")).otherwise(col('umpire1')))
matches_df = matches_df.withColumn('umpire2', when(col('umpire2').isNull(), lit("Unknown")).otherwise(col('umpire2')))
matches_df = matches_df.withColumn('winner', when(col('winner').isNull(), lit("Unknown")).otherwise(col('winner')))
matches_df = matches_df.withColumn('player_of_match', when(col('player_of_match').isNull(), lit("Unknown")).otherwise(col('player_of_match')))
matches_df = matches_df.drop('umpire3')

#DOES NOT UPDATE THE DB !!

# Supprimer la colonne "player_dismissal" de deliveries_df
deliveries_df = deliveries_df.drop('player_dismissal').drop('dismissal_kind').drop('fielder')

# Afficher le schéma mis à jour des DataFrames pour vérification
print("Schema de deliveries_df après suppression de 'player_dismissal':")
deliveries_df.printSchema()

print("\nSchema de matches_df après suppression de 'umpire3':")
matches_df.printSchema()

# Fermer la session Spark
spark.stop()

Schema de deliveries_df après suppression de 'player_dismissal':
root
 |-- match_id: integer (nullable = true)
 |-- inning: integer (nullable = true)
 |-- batting_team: string (nullable = true)
 |-- bowling_team: string (nullable = true)
 |-- over: integer (nullable = true)
 |-- ball: integer (nullable = true)
 |-- batsman: string (nullable = true)
 |-- non_striker: string (nullable = true)
 |-- bowler: string (nullable = true)
 |-- is_super_over: integer (nullable = true)
 |-- wide_runs: integer (nullable = true)
 |-- bye_runs: integer (nullable = true)
 |-- legbye_runs: integer (nullable = true)
 |-- noball_runs: integer (nullable = true)
 |-- penalty_runs: integer (nullable = true)
 |-- batsman_runs: integer (nullable = true)
 |-- extra_runs: integer (nullable = true)
 |-- total_runs: integer (nullable = true)
 |-- player_dismissed: string (nullable = true)


Schema de matches_df après suppression de 'umpire3':
root
 |-- id: integer (nullable = true)
 |-- season: integer (nullable =