In [0]:
# Read CSV file into a Spark DataFrame
file_path = "/FileStore/tables/loan-2.csv"  # Path to the uploaded file
loan_df = spark.read.csv(file_path, header=True, inferSchema=True)

# Display the schema and first few rows
loan_df.printSchema()
loan_df.show(5)


root
 |-- Customer_ID: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Marital Status: string (nullable = true)
 |-- Family Size: integer (nullable = true)
 |-- Income: integer (nullable = true)
 |-- Expenditure: integer (nullable = true)
 |-- Use Frequency: integer (nullable = true)
 |-- Loan Category: string (nullable = true)
 |-- Loan Amount: string (nullable = true)
 |-- Overdue: integer (nullable = true)
 |--  Debt Record: string (nullable = true)
 |--  Returned Cheque: integer (nullable = true)
 |--  Dishonour of Bill: integer (nullable = true)

+-----------+---+------+------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|Customer_ID|Age|Gender|  Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| D

In [0]:
# 1. Number of loans in each category
loan_df.groupBy("Loan Category").count().show()


+------------------+-----+
|     Loan Category|count|
+------------------+-----+
|           HOUSING|   67|
|        TRAVELLING|   53|
|       BOOK STORES|    7|
|       AGRICULTURE|   12|
|         GOLD LOAN|   77|
|  EDUCATIONAL LOAN|   20|
|        AUTOMOBILE|   60|
|          BUSINESS|   24|
|COMPUTER SOFTWARES|   35|
|           DINNING|   14|
|          SHOPPING|   35|
|       RESTAURANTS|   41|
|       ELECTRONICS|   14|
|          BUILDING|    7|
|        RESTAURANT|   20|
|   HOME APPLIANCES|   14|
+------------------+-----+



In [0]:
# 2. Number of People Who Have Taken More Than 1 Lakh Loan

# Filter for loans greater than 1,00,000
high_loan_df = loan_df.filter(loan_df["Loan Amount"] > 100000)
high_loan_count = high_loan_df.count()

# Show the count
print(f"Number of people with loans greater than 1 lakh: {high_loan_count}")



Number of people with loans greater than 1 lakh: 0


In [0]:
# 3. Number of People With Income Greater Than 60,000 Rupees

# Filter for income greater than 60,000
high_income_df = loan_df.filter(loan_df["Income"] > 60000)
high_income_count = high_income_df.count()

# Show the count
print(f"Number of people with income greater than 60,000: {high_income_count}")


Number of people with income greater than 60,000: 198


In [0]:
# 4. Number of People With 2+ Returned Cheques and Income Less Than 50,000

# Filter for 2+ returned cheques and income < 50,000
returned_cheques_df = loan_df.filter((loan_df[" Returned Cheque"] >= 2) & (loan_df["Income"] < 50000))
returned_cheques_count = returned_cheques_df.count()

# Show the count
print(f"Number of people with 2+ returned cheques and income < 50,000: {returned_cheques_count}")


Number of people with 2+ returned cheques and income < 50,000: 137


In [0]:
# 5. Number of People With 2+ Returned Cheques and Are Single

# Filter for 2+ returned cheques and single status
single_status_df = loan_df.filter((loan_df[" Returned Cheque"] >= 2) & (loan_df["Marital Status"] == "Single"))
single_status_count = single_status_df.count()

# Show the count
print(f"Number of people with 2+ returned cheques and are single: {single_status_count}")


Number of people with 2+ returned cheques and are single: 0


In [0]:
# 6. Number of People With Expenditure Over 50,000 a Month

# Filter for expenditure > 50,000
high_expenditure_df = loan_df.filter(loan_df["Expenditure"] > 50000)
high_expenditure_count = high_expenditure_df.count()

# Show the count
print(f"Number of people with monthly expenditure > 50,000: {high_expenditure_count}")


Number of people with monthly expenditure > 50,000: 6


In [0]:
# 7. Number of Members Eligible for a Credit Card

# Filter for credit card eligibility
credit_card_df = loan_df.filter((loan_df["Income"] > 60000) | (loan_df[" Returned Cheque"] == 0))
credit_card_count = credit_card_df.count()

# Show the count
print(f"Number of people eligible for credit card: {credit_card_count}")


Number of people eligible for credit card: 231


In [0]:
# Read the uploaded CSV file into a Spark DataFrame
file_path = "/FileStore/tables/credit_card.csv"  # Path to your file
credit_card_df = spark.read.csv(file_path, header=True, inferSchema=True)

# Display the schema and the first few rows to understand the dataset
credit_card_df.printSchema()
credit_card_df.show(5)


root
 |-- RowNumber: integer (nullable = true)
 |-- CustomerId: integer (nullable = true)
 |-- Surname: string (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- Balance: double (nullable = true)
 |-- NumOfProducts: integer (nullable = true)
 |-- IsActiveMember: integer (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Exited: integer (nullable = true)

+---------+----------+--------+-----------+---------+------+---+------+---------+-------------+--------------+---------------+------+
|RowNumber|CustomerId| Surname|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|IsActiveMember|EstimatedSalary|Exited|
+---------+----------+--------+-----------+---------+------+---+------+---------+-------------+--------------+---------------+------+
|        1|  15634602|Hargrave|        619|   France

In [0]:
# 1. credit card users in Spain

# Filter for credit card users in Spain
spain_users_df = credit_card_df.filter(credit_card_df["Geography"] == "Spain")
spain_users_count = spain_users_df.count()

# Display the count
print(f"Number of credit card users in Spain: {spain_users_count}")



Number of credit card users in Spain: 2477


In [0]:
# 2. number of members who are elgible and active in the bank

# Filter for eligible and active members
eligible_active_df = credit_card_df.filter(
    (credit_card_df["Exited"] == "Eligible") &  # Replace with the actual eligibility column/condition
    (credit_card_df["IsActiveMember"] == "Active")     # Replace with the actual active status column
)
eligible_active_count = eligible_active_df.count()

# Display the count
print(f"Number of eligible and active members: {eligible_active_count}")


Number of eligible and active members: 0


In [0]:
# Load the file
file_path = "/FileStore/tables/txn.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Display the first few rows of the DataFrame
df.show()


+-------------+--------------------+----------+----------------+-------------+-----------+
|   Account No| TRANSACTION DETAILS|VALUE DATE| WITHDRAWAL AMT | DEPOSIT AMT |BALANCE AMT|
+-------------+--------------------+----------+----------------+-------------+-----------+
|409000611074'|TRF FROM  Indiafo...| 29-Jun-17|            null|    1000000.0|  1000000.0|
|409000611074'|TRF FROM  Indiafo...|  5-Jul-17|            null|    1000000.0|  2000000.0|
|409000611074'|FDRL/INTERNAL FUN...| 18-Jul-17|            null|     500000.0|  2500000.0|
|409000611074'|TRF FRM  Indiafor...|  1-Aug-17|            null|    3000000.0|  5500000.0|
|409000611074'|FDRL/INTERNAL FUN...| 16-Aug-17|            null|     500000.0|  6000000.0|
|409000611074'|FDRL/INTERNAL FUN...| 16-Aug-17|            null|     500000.0|  6500000.0|
|409000611074'|FDRL/INTERNAL FUN...| 16-Aug-17|            null|     500000.0|  7000000.0|
|409000611074'|FDRL/INTERNAL FUN...| 16-Aug-17|            null|     500000.0|  7500000.0|

In [0]:
# 1. Maximum withdrawal amount in transactions

# Maximum withdrawal amount
max_withdrawal = df.filter(df[" WITHDRAWAL AMT "] == "Withdrawal").agg({"TRANSACTION DETAILS": "max"})
max_withdrawal.show()

+------------------------+
|max(TRANSACTION DETAILS)|
+------------------------+
|                    null|
+------------------------+



In [0]:
# 2. Minimum withdrawal amount

min_withdrawal = df.filter(df[" WITHDRAWAL AMT "] == "Withdrawal").agg({"TRANSACTION DETAILS": "min"})
min_withdrawal.show()

+------------------------+
|min(TRANSACTION DETAILS)|
+------------------------+
|                    null|
+------------------------+



In [0]:
# 3. MAXIMUM DEPOSIT AMOUNT OF AN ACCOUNT

# Maximum deposit amount
max_deposit = df.filter(df[" DEPOSIT AMT "] == "Deposit").agg({"Account No": "max"})
max_deposit.show()

+---------------+
|max(Account No)|
+---------------+
|           null|
+---------------+



In [0]:
# 4.  Minimum DEPOSIT AMOUNT OF AN ACCOUNT

# Minimum deposit amount
min_deposit = df.filter(df[" DEPOSIT AMT "] == "Deposit").agg({"Account No": "min"})
min_deposit.show()

+---------------+
|min(Account No)|
+---------------+
|           null|
+---------------+



In [0]:
# 5. Sum of balance in every bank account

# Sum of balances
total_balance = df.groupBy("Account No").agg({"BALANCE AMT": "sum"}).withColumnRenamed("sum(Balance)", "TotalBalance")
total_balance.show()


+-------------+--------------------+
|   Account No|    sum(BALANCE AMT)|
+-------------+--------------------+
|409000438611'|-2.49486577068339...|
|     1196711'|-1.60476498101275E13|
|     1196428'| -8.1418498130721E13|
|409000493210'|-3.27584952132095...|
|409000611074'|       1.615533622E9|
|409000425051'|-3.77211841164998...|
|409000405747'|-2.43108047067000...|
|409000493201'|1.0420831829499985E9|
|409000438620'|-7.12291867951358...|
|409000362497'| -5.2860004792808E13|
+-------------+--------------------+



In [0]:
# 6. Number of transaction on each date

# Count transactions by date
transactions_per_date = df.groupBy("VALUE DATE").count().withColumnRenamed("count", "TransactionCount")
transactions_per_date.show()


+----------+----------------+
|VALUE DATE|TransactionCount|
+----------+----------------+
| 23-Dec-16|             143|
|  7-Feb-19|              98|
| 21-Jul-15|              80|
|  9-Sep-15|              91|
| 17-Jan-15|              16|
| 18-Nov-17|              53|
| 21-Feb-18|              77|
| 20-Mar-18|              71|
| 19-Apr-18|              71|
| 21-Jun-16|              97|
| 17-Oct-17|             101|
|  3-Jan-18|              70|
|  8-Jun-18|             223|
| 15-Dec-18|              62|
|  8-Aug-16|              97|
| 17-Dec-16|              74|
|  3-Sep-15|              83|
| 21-Jan-16|              76|
|  4-May-18|              92|
|  7-Sep-17|              94|
+----------+----------------+
only showing top 20 rows



In [0]:
# 7. List of customers with withdrawal amount more than 1 lakh

# Filter withdrawals greater than 1 lakh
high_withdrawals = df.filter((df[" WITHDRAWAL AMT "] > 100000))

# Select relevant columns: Account No, WITHDRAWAL AMT, VALUE DATE
high_withdrawals.select("Account No", " WITHDRAWAL AMT ", "VALUE DATE").show()



+-------------+----------------+----------+
|   Account No| WITHDRAWAL AMT |VALUE DATE|
+-------------+----------------+----------+
|409000611074'|        133900.0| 16-Aug-17|
|409000611074'|        195800.0| 16-Aug-17|
|409000611074'|        143800.0| 16-Aug-17|
|409000611074'|        331650.0| 16-Aug-17|
|409000611074'|        129000.0| 16-Aug-17|
|409000611074'|        230013.0| 16-Aug-17|
|409000611074'|        367900.0| 16-Aug-17|
|409000611074'|        108000.0| 16-Aug-17|
|409000611074'|        141000.0| 16-Aug-17|
|409000611074'|        206000.0| 16-Aug-17|
|409000611074'|        242300.0|  6-Sep-17|
|409000611074'|        113250.0|  6-Sep-17|
|409000611074'|        206900.0|  6-Sep-17|
|409000611074'|        276000.0|  6-Sep-17|
|409000611074'|        171000.0|  6-Sep-17|
|409000611074'|        189800.0|  6-Sep-17|
|409000611074'|        271323.0|  6-Sep-17|
|409000611074'|        200600.0|  6-Sep-17|
|409000611074'|        176900.0|  6-Sep-17|
|409000611074'|        150050.0|