# Assignments

## Assignment - 1

In [13]:
import findspark

findspark.init()

In [14]:
import pyspark

from pyspark.sql import SparkSession

from pyspark.sql.functions import *
from pyspark.sql.types import *

In [15]:
spark = SparkSession.builder.master("local").appName("SparkApp").getOrCreate()

## Find min and max temperatures in given weather_data
### *The data is seperated by one or more spaces. So first transform the data and do the calculations*

In [16]:
path  = "D:/futurense_hadoop-pyspark/labs/dataset/weather/weather_data.txt"

In [17]:
columns = ['WBANNO',
 'LST_DATE',
 'CRX_VN',
 'LONGITUDE',
 'LATITUDE',
 'T_DAILY_MAX',
 'T_DAILY_MIN',
 'T_DAILY_MEAN',
 'T_DAILY_AVG',
 'P_DAILY_CALC',
 'SOLARAD_DAILY',
 'SUR_TEMP_DAILY_TYPE',
 'SUR_TEMP_DAILY_MAX',
 'SUR_TEMP_DAILY_MIN',
 'SUR_TEMP_DAILY_AVG',
 'RH_DAILY_MAX',
 'RH_DAILY_MIN',
 'RH_DAILY_AVG',
 'SOIL_MOISTURE_5_DAILY',
 'SOIL_MOISTURE_10_DAILY',
 'SOIL_MOISTURE_20_DAILY',
 'SOIL_MOISTURE_50_DAILY',
 'SOIL_MOISTURE_100_DAILY',
 'SOIL_TEMP_5_DAILY',
 'SOIL_TEMP_10_DAILY',
 'SOIL_TEMP_20_DAILY',
 'SOIL_TEMP_50_DAILY',
 'SOIL_TEMP_100_DAILY']


### First read the data remove extra spaces and make it as comma seperated values

In [18]:
data = spark.read.load(path,format="csv")

#df = data.withColumn("values", regexp_replace(data.values, "\s+", ",")).withColumn("values",split(data.value,","))

df = data.withColumn("_c0",regexp_replace(data._c0,"\s+",","))

### Split the data based on ',' and store values in respective columns

In [19]:

split_data = df.select(split(df._c0, ",").alias("data"))


converted_data = split_data.select(
    *[split_data.data.getItem(idx).alias(columns[idx]) for idx in range(len(columns))])
    

#select the required the columns and type cast them into required datatype     
min_max_df = converted_data.select(col("T_DAILY_MAX").cast(FloatType()).alias("T_DAILY_MAX"),col("T_DAILY_MIN").cast(FloatType()).alias("T_DAILY_MIN"))

# find the min from the daily_min and max from daily_max
min_max_df.agg(min("T_DAILY_MIN").alias("Min_temp"),max("T_DAILY_MAX").alias("max_temp")).show()

+--------+--------+
|Min_temp|max_temp|
+--------+--------+
|    -7.9|    36.0|
+--------+--------+



## Find the max and min temperatures in each month

In [20]:
months = {'01':'Jan','02':'Feb','03':'Mar','04':'Apr','05':'May','06':'Jun',
          '07':'Jul','08':'Aug','09':'Sep','10':'Oct','11':'Nov','12':'Dec'}

def find_month_name(month_num):
    return months[month_num]

month_df = udf(find_month_name,StringType())


df = converted_data.withColumn('MONTH',month_df(substring('LST_DATE',5,2)))\
                   .select(
                            col('MONTH'),col("T_DAILY_MAX").cast(FloatType()).alias("T_DAILY_MAX"),
                            col("T_DAILY_MIN").cast(FloatType()).alias("T_DAILY_MIN")
                          )

df.groupBy('MONTH').agg(min('T_DAILY_MIN').alias('Min_temp'),max('T_DAILY_MAX').alias('Max_temp')).show()

+-----+--------+--------+
|MONTH|Min_temp|Max_temp|
+-----+--------+--------+
|  May|    14.3|    31.1|
|  Jun|     0.0|    33.6|
|  Feb|    -3.5|    26.6|
|  Mar|    -3.2|    29.1|
|  Jan|    -7.9|    26.5|
|  Apr|     8.0|    30.8|
|  Jul|    19.8|    36.0|
+-----+--------+--------+



# Assignment-2 
## Find the total count of ratings for each type of rating

In [29]:
path = r'C:/Users/rakes/OneDrive/Desktop/ml-latest-small/ml-latest-small/ratings.csv'

In [34]:
ratings_df = spark.read.csv(path,header=True)

ratings_df.groupBy('rating').count().show()

+------+-----+
|rating|count|
+------+-----+
|   1.0| 2811|
|   4.5| 8551|
|   2.5| 5550|
|   3.5|13136|
|   5.0|13211|
|   0.5| 1370|
|   4.0|26818|
|   1.5| 1791|
|   2.0| 7551|
|   3.0|20047|
+------+-----+



# Assignment-3

## Bankmarket- Analysis

In [35]:
path = 'D:/futurense_hadoop-pyspark/labs/dataset/bankmarket/bankmarketdata.csv'

### 1.	Load data and create a DataFrame

In [60]:
bank_df = spark.read.csv(path,header=True,sep=';',inferSchema=True)
bank_df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



### 2.	Give marketing success rate. (No. of people subscribed / total no. of entries)

In [48]:
total_count = bank_df.count()

success_rate = bank_df.filter(bank_df['y'] == 'yes').count() / total_count * 100

print("Success rate:",success_rate)


Success rate: 11.698480458295547


   ### 3.Give marketing failure rate

In [52]:
failure_rate = bank_df.filter(bank_df.y == 'no').count() / total_count * 100

print("Failure rate:",failure_rate)

Failure rate: 88.30151954170445


### 4.	Maximum, Mean, and Minimum age of the average targeted customer

In [64]:
bank_df.select('age').agg(min(bank_df.age).alias('Min_Age'),
                          max(bank_df.age).alias('Max_Age'),
                          round(avg(bank_df.age),2).alias('Avg_age')).show()

+-------+-------+-------+
|Min_Age|Max_Age|Avg_age|
+-------+-------+-------+
|     18|     95|  40.94|
+-------+-------+-------+



### 5.	Check the quality of customers by checking the average balance, median balance of customers

In [74]:
bank_df.agg(
            round(avg(bank_df.balance),2).alias("Avg_balance"),
            percentile_approx(bank_df.balance,0.5).alias("Median_balance")
            ).show()

+-----------+--------------+
|Avg_balance|Median_balance|
+-----------+--------------+
|    1362.27|           448|
+-----------+--------------+



### 6.	Check if age matters in marketing subscription for deposit

In [98]:
bank_df.select('age','y').filter(bank_df['y'] == 'yes').groupBy('age')\
    .count().orderBy('age').show()

+---+-----+
|age|count|
+---+-----+
| 18|    7|
| 19|   11|
| 20|   15|
| 21|   22|
| 22|   40|
| 23|   44|
| 24|   68|
| 25|  113|
| 26|  134|
| 27|  141|
| 28|  162|
| 29|  171|
| 30|  217|
| 31|  206|
| 32|  221|
| 33|  210|
| 34|  198|
| 35|  209|
| 36|  195|
| 37|  170|
+---+-----+
only showing top 20 rows



In [107]:
bank_df.select('age','y').filter(bank_df.y == 'yes').groupBy('age').count().show()

+---+-----+
|age|count|
+---+-----+
| 31|  206|
| 85|    4|
| 65|   21|
| 53|   85|
| 78|   14|
| 34|  198|
| 81|    6|
| 28|  162|
| 76|   16|
| 26|  134|
| 27|  141|
| 44|   93|
| 22|   40|
| 93|    2|
| 47|  113|
| 52|   85|
| 86|    4|
| 40|  116|
| 20|   15|
| 57|   78|
+---+-----+
only showing top 20 rows



### 7.	Check if marital status mattered for subscription to deposit.

In [99]:
bank_df.select('marital','y').filter(bank_df.y == 'yes').groupBy('marital').count().show()                       

+--------+-----+
| marital|count|
+--------+-----+
|divorced|  622|
| married| 2755|
|  single| 1912|
+--------+-----+



### 8.	Check if age and marital status together mattered for subscription to deposit scheme

In [101]:
bank_df.select('age','marital','y')\
        .filter(bank_df.y == 'yes').groupBy('age','marital')\
        .count().orderBy('age').show()                       

+---+--------+-----+
|age| marital|count|
+---+--------+-----+
| 18|  single|    7|
| 19|  single|   11|
| 20|  single|   14|
| 20| married|    1|
| 21|  single|   21|
| 21| married|    1|
| 22|  single|   40|
| 23| married|    2|
| 23|  single|   42|
| 24|  single|   58|
| 24| married|   10|
| 25| married|   14|
| 25|  single|   99|
| 26| married|   13|
| 26|  single|  121|
| 27| married|   29|
| 27|  single|  110|
| 27|divorced|    2|
| 28| married|   20|
| 28|  single|  138|
+---+--------+-----+
only showing top 20 rows

