In [None]:
pip install pyxlsb

Collecting pyxlsb
  Downloading pyxlsb-1.0.10-py2.py3-none-any.whl (23 kB)
Installing collected packages: pyxlsb
Successfully installed pyxlsb-1.0.10


In [None]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=f9b18177920ba89699ad3be493d41b1b104a91f0b0bfad227cee6354f1e1d486
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Step 1: Initialize SparkSession
spark = SparkSession.builder \
    .appName("DiseasePrediction") \
    .getOrCreate()

# Step 2: Load data
data_old = pd.read_excel("/content/generated_data.xlsb")
data = spark.createDataFrame(data_old)

# Step 3: Encode categorical variables
categorical_columns_data = ["Gender", "Fever", "Cough", "Headache", "Nausea", "Vomiting", "pain chest",
                       "shortness of breath", "dizziness", "asthenia", "fall", "syncope", "vertigo",
                       "sweat", "sweating increased","palpitation","angina pectoris","pressure chest",
                       "polyuria","polydypsia","orthopnea","rale","unresponsiveness","hallucinations visual",
                       "bedridden","prostatism"]

# Target variable encoding
labelindex = StringIndexer(inputCol="Disease", outputCol="Disease_index")
data = labelindex.fit(data).transform(data)

# Categorical variable encoding
indexers_to_encode = [StringIndexer(inputCol=column, outputCol=column+"_index", handleInvalid="keep")
            for column in categorical_columns_data]

# Fit and transform the DataFrame with StringIndexer
for indexer in indexers_to_encode:
    data = indexer.fit(data).transform(data)

# Step 4: Assemble features
featurecolumns = [col + "_index" for col in categorical_columns_data]
assembler = VectorAssembler(inputCols=featurecolumns, outputCol="features")
preprocessed_data = assembler.transform(data)

# Step 5: Train-Test Split (optional)
# In PySpark, train-test split is usually not required as it handles the entire dataset

# Step 6: Model Training and Evaluation
diff_models_used = [
    RandomForestClassifier(featuresCol="features", labelCol="Disease_index", numTrees=100, seed=42),
    DecisionTreeClassifier(featuresCol="features", labelCol="Disease_index", seed=42),
    LogisticRegression(featuresCol="features", labelCol="Disease_index")]

for classifier in diff_models_used:
    # Train the model
    model = classifier.fit(preprocessed_data)

    # Make predictions
    predictions_made = model.transform(preprocessed_data)

    # Evaluate the model
    evaluator_for_model = MulticlassClassificationEvaluator(labelCol="Disease_index", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator_for_model.evaluate(predictions_made)
    print("Model Name:", type(classifier).__name__)
    print("Accuracy Achieved:", accuracy)

# Stop Spark session
spark.stop()


Model Name: RandomForestClassifier
Accuracy Achieved: 0.2676985353970708
Model Name: DecisionTreeClassifier
Accuracy Achieved: 0.25721651443302884
Model Name: LogisticRegression
Accuracy Achieved: 0.2533785067570135


In [None]:
from pyspark.ml.classification import GBTClassifier, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

gbt = GBTClassifier(featuresCol="features", labelCol="Disease_index", maxIter=10)

new_model = OneVsRest(classifier=gbt, labelCol="Disease_index")

ovrmodel = new_model.fit(preprocessed_data)

predictions = ovrmodel.transform(preprocessed_data)

evaluator_for_model = MulticlassClassificationEvaluator(labelCol="Disease_index", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator_for_model.evaluate(predictions)
print("Accuracy Achieved:", accuracy)


Accuracy Achieved: 0.26829053658107316
