<a href="https://colab.research.google.com/github/Rajaanthonysamy/pyspark/blob/main/02_pyspark_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
import pandas as pd
import random

# Generate a small sample dataset: five sequentially numbered users, each with
# a random age in the inclusive range 18-60.
# A dedicated, seeded generator makes "Restart & Run All" produce the same ages
# on every run without touching the global `random` state.
# NOTE: outputs saved in this notebook were recorded before the seed was added;
# re-run all cells to refresh them.
rng = random.Random(42)
data = {
    'username': [f'user_{i+1}' for i in range(5)],
    'age': [rng.randint(18, 60) for _ in range(5)]
}

df = pd.DataFrame(data)

# Save to CSV without the pandas index so the file holds only the two data
# columns (the same file is read back with Spark further down).
csv_filename = 'user_data.csv'
df.to_csv(csv_filename, index=False)

print(f"Dataset saved to {csv_filename}")
print(df)

Dataset saved to user_data.csv
  username  age
0   user_1   40
1   user_2   50
2   user_3   46
3   user_4   35
4   user_5   47


In [35]:
from pyspark.sql import SparkSession

# Entry point for all Spark work in this notebook. getOrCreate() hands back an
# already-running session when there is one, so re-running this cell is safe.
spark_session = (
    SparkSession.builder
    .appName("Spark_Practice")
    .getOrCreate()
)

In [36]:
# Echo the session object as the cell's last expression so the notebook
# renders its repr.
spark_session

In [37]:
# Load the generated CSV with Spark, using the option()-chaining form of the
# reader. Note that this rebinds `df` from the pandas DataFrame created above
# to a Spark DataFrame.
df = (
    spark_session.read
    .option("header", "true")
    .csv("user_data.csv", inferSchema=True)
)

In [38]:
# summary() hands back another DataFrame; as a bare last expression only its
# repr (column names and types) is displayed. Chain .show() to print the
# actual statistics.
df.summary()

DataFrame[summary: string, username: string, age: string]

In [39]:
# Print the column names and types Spark settled on; `age` comes out as an
# integer because the file was read with inferSchema=True.
df.printSchema()

root
 |-- username: string (nullable = true)
 |-- age: integer (nullable = true)



In [40]:
# Same CSV load as before, this time passing the reader settings directly as
# keyword arguments to csv() instead of chaining option().
df_pyspark = spark_session.read.csv(
    "user_data.csv",
    header=True,
    inferSchema=True,
)

In [41]:
# Confirm the keyword-argument form of the reader produced the same schema
# (username: string, age: integer).
df_pyspark.printSchema()

root
 |-- username: string (nullable = true)
 |-- age: integer (nullable = true)



In [42]:
# Show the Python class of the object returned by the Spark CSV reader.
type(df_pyspark)

In [43]:
# Column names as a plain Python list of strings.
df_pyspark.columns

['username', 'age']

In [44]:
# Fetch the first three rows; the result is a Python list of Row objects
# rather than a printed table.
df_pyspark.head(3)

[Row(username='user_1', age=40),
 Row(username='user_2', age=50),
 Row(username='user_3', age=46)]

In [45]:
# Project a single column by name and print it as a text table.
df_pyspark.select("username").show()

+--------+
|username|
+--------+
|  user_1|
|  user_2|
|  user_3|
|  user_4|
|  user_5|
+--------+



In [46]:
# Project several columns at once by passing a list of column names.
df_pyspark.select(["username","age"]).show()

+--------+---+
|username|age|
+--------+---+
|  user_1| 40|
|  user_2| 50|
|  user_3| 46|
|  user_4| 35|
|  user_5| 47|
+--------+---+



In [47]:
# Indexing by column name yields a Column object (a reference to the column),
# not the column's values.
df_pyspark['username']

Column<'username'>

In [48]:
# List of (column name, type name) pairs.
df_pyspark.dtypes

[('username', 'string'), ('age', 'int')]

In [49]:
# describe() on its own only returns another DataFrame, so the cell displays
# just its repr; the statistics are printed in the next cell via .show().
df_pyspark.describe()

DataFrame[summary: string, username: string, age: string]

In [50]:
# Print count, mean, stddev, min and max for every column. For the string
# column `username`, mean and stddev are shown as NULL.
df_pyspark.describe().show()

+-------+--------+-----------------+
|summary|username|              age|
+-------+--------+-----------------+
|  count|       5|                5|
|   mean|    NULL|             43.6|
| stddev|    NULL|6.024948132556828|
|    min|  user_1|               35|
|    max|  user_5|               50|
+-------+--------+-----------------+



In [51]:
# Add a derived "experience" column computed as age minus 15 and rebind
# `df_pyspark` to the resulting DataFrame.
df_pyspark = df_pyspark.withColumn(
    "experience",
    df_pyspark["age"] - 15,
)

In [52]:
# Display the DataFrame, now including the derived `experience` column.
df_pyspark.show()

+--------+---+----------+
|username|age|experience|
+--------+---+----------+
|  user_1| 40|        25|
|  user_2| 50|        35|
|  user_3| 46|        31|
|  user_4| 35|        20|
|  user_5| 47|        32|
+--------+---+----------+



In [53]:
# Remove the derived column again; the result is assigned back so that the
# following cells see only `username` and `age`.
df_pyspark = df_pyspark.drop("experience")

In [54]:
# Display the DataFrame to confirm `experience` is gone.
df_pyspark.show()

+--------+---+
|username|age|
+--------+---+
|  user_1| 40|
|  user_2| 50|
|  user_3| 46|
|  user_4| 35|
|  user_5| 47|
+--------+---+



In [55]:
# Show the data with `age` renamed to `Age`. The renamed result is only
# displayed, not assigned, so the name `df_pyspark` is not rebound here.
df_pyspark.withColumnRenamed("age","Age").show()

+--------+---+
|username|Age|
+--------+---+
|  user_1| 40|
|  user_2| 50|
|  user_3| 46|
|  user_4| 35|
|  user_5| 47|
+--------+---+

