In [0]:
# --- Load, clean, and explore the diagnosis data -----------------------------
# Imports for this stage live at the top of the block so it runs on a fresh
# kernel instead of depending on names leaked from another cell.
# NOTE(review): `spark` and `display` are not defined in this cell; they are
# assumed to be injected by the notebook runtime (e.g. Databricks) — confirm.
import pandas as pd
from pyspark.sql.functions import col, when

# Fetch the CSV over HTTP into a local pandas frame.
url = "https://raw.githubusercontent.com/deepanshuMeteor/ComputerVision/refs/heads/main/data.csv"
df_pandas = pd.read_csv(url, sep=',')

# Build the cleaned, label-encoded Spark DataFrame in one expression so `df`
# is never left bound to a half-processed intermediate:
#   1. convert the pandas frame to a Spark DataFrame,
#   2. drop the 'id' column,
#   3. drop every row that contains a missing value,
#   4. encode 'diagnosis' as 1 for 'M' and 0 for anything else.
# Caution: step 4 maps *every* non-'M' value to 0, so an unexpected or
# misspelled label would silently be treated as class 0.
df = (
    spark.createDataFrame(df_pandas)
    .drop('id')
    .na.drop()
    .withColumn('diagnosis', when(col('diagnosis') == 'M', 1).otherwise(0))
)

# Display basic statistics (count, mean, stddev, min, max) for every column.
display(df.describe())

# Display how many rows fall into each 'diagnosis' class.
display(df.groupBy('diagnosis').count())

from pyspark.ml.feature import VectorAssembler

# Seeded 80/20 random split of the prepared frame into train and test parts.
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

# Every column except the 'diagnosis' label is used as a model input.
feature_cols = [name for name in df.columns if name != 'diagnosis']

# Pack the input columns into a single vector column called 'features',
# applying the same assembler to both splits.
assembler = VectorAssembler(outputCol='features', inputCols=feature_cols)
train_df, test_df = map(assembler.transform, (train_df, test_df))

from pyspark.ml.classification import LogisticRegression

# Logistic regression reading inputs from 'features' and the label from
# 'diagnosis'; all other estimator settings are left at their defaults.
lr = LogisticRegression(featuresCol='features', labelCol='diagnosis')

# Fit on the training split ...
model = lr.fit(train_df)

# ... then score the held-out split; the result carries a 'prediction' column.
predictions = model.transform(test_df)

# --- Evaluate the predictions -------------------------------------------------
# `col` stays imported under its original name; `sum` is aliased so the Python
# builtin is not shadowed.
from pyspark.sql.functions import col, count, lit, sum as spark_sum

# Count all rows and the correctly classified rows in ONE aggregation, so the
# prediction plan is evaluated once rather than once per count() action.
# A row is correct when 'diagnosis' equals 'prediction'; the boolean is cast to
# an int (1/0) so it can be summed.
is_correct = (col('diagnosis') == col('prediction')).cast('int')
totals = predictions.agg(
    count(lit(1)).alias('total'),
    spark_sum(is_correct).alias('correct'),
).first()

total_predictions = totals['total']
# SUM is NULL (None) when it has no non-null inputs; treat that as zero hits.
correct_predictions = totals['correct'] or 0

# Fail with a clear message instead of a bare ZeroDivisionError when the test
# split turned out to be empty.
if total_predictions == 0:
    raise ValueError("Cannot compute accuracy: the prediction set is empty.")

# Both metrics are fractions in [0, 1] (not percentages).
accuracy = correct_predictions / total_predictions
error_rate = 1 - accuracy

# Display the accuracy and error rate as a one-row table.
display(spark.createDataFrame([(accuracy, error_rate)], ["Accuracy", "Error Rate"]))

# Display the feature vector, true label, and predicted label for each test row.
display(predictions.select('features', 'diagnosis', 'prediction'))

summary,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,0.3725834797891036,14.127291739894552,19.28964850615114,91.96903339191564,654.8891036906855,0.0963602811950791,0.1043409841827768,0.0887993158172232,0.0489191458699472,0.1811618629173989,0.0627976098418277,0.4051720562390157,1.2168534270650264,2.8660592267135323,40.33707908611599,0.007040978910369,0.0254781388400702,0.0318937163444639,0.011796137082601,0.0205422987697715,0.0037949038664323,16.26918980667838,25.677223198594017,107.26121265377856,880.5831282952548,0.1323685940246045,0.2542650439367311,0.2721884833040422,0.114606223198594,0.2900755711775044,0.0839458172231985
stddev,0.4839179564031687,3.5240488262120766,4.30103576816695,24.298981038754903,351.914129181653,0.0140641281376736,0.0528127579325121,0.0797198087078935,0.0388028448591536,0.0274142813360357,0.0070603627950844,0.2773127329861039,0.5516483926172022,2.021854554042108,45.4910055161318,0.003002517943839,0.0179081793256773,0.0301860603229884,0.0061702851740468,0.0082663715287983,0.0026460709670891,4.833241580469323,6.14625762303832,33.60254226903636,569.356992669949,0.0228324294048354,0.157336488913742,0.2086242806081322,0.065732341195942,0.0618674675375187,0.0180612673488939
min,0.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,0.1115,0.3602,0.757,6.802,0.001713,0.002252,0.0,0.0,0.007882,0.0008948,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
max,1.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,2.873,4.885,21.98,542.2,0.03113,0.1354,0.396,0.05279,0.07895,0.02984,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


diagnosis,count
1,212
0,357


Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy,Error Rate
0.9380530973451328,0.0619469026548672


features,diagnosis,prediction
"Map(vectorType -> dense, length -> 30, values -> List(8.598, 20.98, 54.66, 221.8, 0.1243, 0.08963, 0.03, 0.009259, 0.1828, 0.06757, 0.3582, 2.067, 2.493, 18.39, 0.01193, 0.03162, 0.03, 0.009259, 0.03357, 0.003048, 9.565, 27.04, 62.06, 273.9, 0.1639, 0.1698, 0.09001, 0.02778, 0.2972, 0.07712))",0,0.0
"Map(vectorType -> dense, length -> 30, values -> List(8.95, 15.76, 58.74, 245.2, 0.09462, 0.1243, 0.09263, 0.02308, 0.1305, 0.07163, 0.3132, 0.9789, 3.28, 16.94, 0.01835, 0.0676, 0.09263, 0.02308, 0.02384, 0.005601, 9.414, 17.07, 63.34, 270.0, 0.1179, 0.1879, 0.1544, 0.03846, 0.1652, 0.07722))",0,0.0
"Map(vectorType -> dense, length -> 30, values -> List(9.173, 13.86, 59.2, 260.9, 0.07721, 0.08751, 0.05988, 0.0218, 0.2341, 0.06963, 0.4098, 2.265, 2.608, 23.52, 0.008738, 0.03938, 0.04312, 0.0156, 0.04192, 0.005822, 10.01, 19.23, 65.59, 310.1, 0.09836, 0.1678, 0.1397, 0.05087, 0.3282, 0.0849))",0,0.0
"Map(vectorType -> dense, length -> 30, values -> List(9.787, 19.94, 62.11, 294.5, 0.1024, 0.05301, 0.006829, 0.007937, 0.135, 0.0689, 0.335, 2.043, 2.132, 20.05, 0.01113, 0.01463, 0.005308, 0.00525, 0.01801, 0.005667, 10.92, 26.29, 68.81, 366.1, 0.1316, 0.09473, 0.02049, 0.02381, 0.1934, 0.08988))",0,0.0
"Map(vectorType -> dense, length -> 30, values -> List(11.31, 19.04, 71.8, 394.1, 0.08139, 0.04701, 0.03709, 0.0223, 0.1516, 0.05667, 0.2727, 0.9429, 1.831, 18.15, 0.009282, 0.009216, 0.02063, 0.008965, 0.02183, 0.002146, 12.33, 23.84, 78.0, 466.7, 0.129, 0.09148, 0.1444, 0.06961, 0.24, 0.06641))",0,0.0
"Map(vectorType -> dense, length -> 30, values -> List(11.45, 20.97, 73.81, 401.5, 0.1102, 0.09362, 0.04591, 0.02233, 0.1842, 0.07005, 0.3251, 2.174, 2.077, 24.62, 0.01037, 0.01706, 0.02586, 0.007506, 0.01816, 0.003976, 13.11, 32.16, 84.53, 525.1, 0.1557, 0.1676, 0.1755, 0.06127, 0.2762, 0.08851))",0,0.0
"Map(vectorType -> dense, length -> 30, values -> List(11.93, 21.53, 76.53, 438.6, 0.09768, 0.07849, 0.03328, 0.02008, 0.1688, 0.06194, 0.3118, 0.9227, 2.0, 24.79, 0.007803, 0.02507, 0.01835, 0.007711, 0.01278, 0.003856, 13.67, 26.15, 87.54, 583.0, 0.15, 0.2399, 0.1503, 0.07247, 0.2438, 0.08541))",0,0.0
"Map(vectorType -> dense, length -> 30, values -> List(12.19, 13.29, 79.08, 455.8, 0.1066, 0.09509, 0.02855, 0.02882, 0.188, 0.06471, 0.2005, 0.8163, 1.973, 15.24, 0.006773, 0.02456, 0.01018, 0.008094, 0.02662, 0.004143, 13.34, 17.81, 91.38, 545.2, 0.1427, 0.2585, 0.09915, 0.08187, 0.3469, 0.09241))",0,0.0
"Map(vectorType -> dense, length -> 30, values -> List(13.27, 14.76, 84.74, 551.7, 0.07355, 0.05055, 0.03261, 0.02648, 0.1386, 0.05318, 0.4057, 1.153, 2.701, 36.35, 0.004481, 0.01038, 0.01358, 0.01082, 0.01069, 0.001435, 16.36, 22.35, 104.5, 830.6, 0.1006, 0.1238, 0.135, 0.1001, 0.2027, 0.06206))",0,0.0
"Map(vectorType -> dense, length -> 30, values -> List(13.34, 15.86, 86.49, 520.0, 0.1078, 0.1535, 0.1169, 0.06987, 0.1942, 0.06902, 0.286, 1.016, 1.535, 12.96, 0.006794, 0.03575, 0.0398, 0.01383, 0.02134, 0.004603, 15.53, 23.19, 96.66, 614.9, 0.1536, 0.4791, 0.4858, 0.1708, 0.3527, 0.1016))",0,1.0
