In [13]:
import pandas as pd
# Initialize Spark Session
spark = SparkSession.builder.appName("Covid19DataAnalysis").getOrCreate()

# Load the dataset from HDFS
df = pd.read_csv("owid-covid-data.csv")

# Data Preprocessing
df = df.withColumn('date', to_date(df['date'], 'yyyy-MM-dd'))
df = df.na.fill(0)
kenya_df = df.filter(df['location'] == 'Kenya')

# Select features and target variable for the model
feature_columns = ['total_cases', 'new_cases', 'population']
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(kenya_df).select('features', 'total_deaths')

# Split the data
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Train the model
lr = LinearRegression(featuresCol='features', labelCol='total_deaths')
lr_model = lr.fit(train_data)

# Make predictions
predictions = lr_model.transform(test_data)

# Evaluate the model
evaluator = RegressionEvaluator(labelCol="total_deaths", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) on test data: {rmse}")

# Visualization
pandas_df = predictions.select("total_deaths", "prediction").toPandas()
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.scatterplot(x='total_deaths', y='prediction', data=pandas_df)
plt.title('Actual vs Predicted Total Deaths')

plt.subplot(1, 2, 2)
sns.distplot(pandas_df['total_deaths'] - pandas_df['prediction'])
plt.title('Residuals Distribution')
plt.xlabel('Residuals')

plt.tight_layout()
plt.show()

# Stop the Spark session
spark.stop()


AttributeError: 'DataFrame' object has no attribute 'withColumn'