In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark import SparkContext, SparkConf
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import when
import time
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
from pyspark.sql.functions import lit
from pyspark.sql.functions import col, expr, when

spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("Self-training") \
    .getOrCreate()

sc = spark.sparkContext

In [2]:
#compute the parameter in the dataset
def analyse_the_data(DataFrame,Label):
    # labeled: data with label 1 or 0
    # unlabel: data without label(we use -1 to recognize the unlabeled data)
    unlabeled = DataFrame.filter(col(Label).isin([-1]))
    labeled = DataFrame.filter(col(Label).isin([0,1]))
    label_0 = labeled.filter(col(Label).isin([0])).count()
    label_1 = labeled.filter(col(Label).isin([1])).count()
    ratioNP_P = label_0/label_1
    return labeled,unlabeled,ratioNP_P,label_0,label_1

In [3]:
#create vector
def vector_assembler(DataFrame):                                  
    vecAss = VectorAssembler(
        inputCols=["I", "V", "L", "F", "C", "M", "A", "G", "T", "S", "W", "Y", "P", "H", "E", "Q", "D", "N", "K", "R",
                   "hdHydro_mean", "helical_mean"], outputCol="features")
    DataFrame = vecAss.transform(DataFrame)
    return DataFrame.cache()

In [4]:
#read and preprocess the training dataset
df = spark.read.format('csv').option("header", 'true').load("trainselftraining.csv")
df.cache()
numerical_cols = df.columns
df = df.select([F.col(c).cast("float").alias(c) for c in numerical_cols])
df = vector_assembler(df)

In [5]:
#read and preprocess the testing dataset
test = spark.read.format('csv').option("header", 'true').load("testselftraining.csv") 
numerical_colstest = test.columns
test = test.select([F.col(c).cast("float").alias(c) for c in numerical_colstest])
test = vector_assembler(test)
test = test.withColumn("Cytoplasm",when(col("Cytoplasm").between(2,6),1).otherwise(0))

In [1]:
#main body of training and testing
def train_the_classifer(DataFrame, MAXITER, Label,Threshold):
    # build model
    rf = RandomForestClassifier(labelCol=Label, featuresCol="features", maxDepth=5, numTrees=16,seed = 1234567)
    pipeline = Pipeline(stages=[rf])
    
    labeled, unlabeled, ratioNP_P, label_0, label_1 = analyse_the_data(DataFrame, Label)
    
    #training
    for i in range(MAXITER):
        print(f'Iteration: {i} - labeled size: {labeled.count()},unlabeled size:{unlabeled.count()}')
        training = time.time()

        pipeline_model = pipeline.fit(labeled)
        print(f'Training time: {time.time() - training}')
        
        # Get confident predictions on the current unlabelled
        confidences = time.time()
        
        #predict
        pred_unlabeled = pipeline_model.transform(unlabeled).cache()
        pred_unlabeled = pred_unlabeled.withColumn('xs', vector_to_array('probability'))
        pred_unlabeled = pred_unlabeled.withColumn('class_0_prob', F.col('xs')[0])
        pred_unlabeled = pred_unlabeled.withColumn('class_1_prob', F.col('xs')[1])
        
        # compute the majority of 1s in each class, and that will help us decide how many we add
        zero = pred_unlabeled.filter(pred_unlabeled['class_0_prob'] >= Threshold).cache()
        one = pred_unlabeled.filter(pred_unlabeled['class_1_prob'] >= 0.5).cache()
        class_0 = pred_unlabeled.filter('class_0_prob >= 0.5').count()
        class_1 = pred_unlabeled.filter('class_1_prob >= 0.5').count()

        pred_unlabeled = pred_unlabeled.withColumn(Label, when(col("class_0_prob").between(0.6, 1), 0).when(
            col("class_1_prob").between(0.6, 1), 1).otherwise(-1))
        
        # No new credible predicted data added
        if 0 in (class_0, class_1):
            break
            
        # Add the credible predicted data according to the ratio
        if class_0 > class_1:
            ratio = round(class_1 * ratioNP_P) / class_0
            class_0 = round(class_1 * ratioNP_P)
            print(class_0, class_1, ratioNP_P, ratio)
            if ratio <= 1:
                to_add_class0 = zero.sample(ratio)
                to_add_class1 = one
            else:
                to_add_class0 = zero
                to_add_class1 = one.sample(1/ratioNP_P)
        else:
            ratio = round(class_0 / ratioNP_P) / class_1
            class_1 = round(class_0 / ratioNP_P)
            if ratio <= 1:
                to_add_class0 = zero
                to_add_class1 = one.sample(ratio)
            else:
                to_add_class0 = zero
                to_add_class1 = one.sample(1/ratioNP_P)


        print(f'Adding {class_0} instances from class 0, and {class_1} from class 1')
        print(f'Confidences time: {time.time() - confidences}')
        
        # fix the dataframe of the credible prediction data that will be added to the training set
        # column should be same with the training data
        to_add_class0 = to_add_class0.drop('rawPrediction', 'probability', 'xs', 'class_0_prob', 'class_1_prob',
                                           'prediction').cache()
        to_add_class1 = to_add_class1.drop('rawPrediction', 'probability', 'xs', 'class_0_prob', 'class_1_prob',
                                           'prediction').cache()
        unlabeled = unlabeled.subtract(to_add_class0).subtract(to_add_class1)
        to_add_class0 = to_add_class0.withColumn(Label, lit(0))
        to_add_class1 = to_add_class1.withColumn(Label, lit(1))

        update = time.time()
        
        #process the labeled and unlabeled dataset
        labeled = labeled.union(to_add_class0).union(to_add_class1)
        
        
        pred_unlabeled.unpersist()

        print(f'Updating sets time: {time.time() - update}')
    
    print(f'Iteration: {i} - labeled size: {labeled.count()},unlabeled size:{unlabeled.count()}')
    training = time.time()
    pipeline_model = pipeline.fit(labeled)
    print(f'Training time: {time.time() - training}')
    # test the model
    final_pred_inductive = pipeline_model.transform(test)
    final_pred_inductive = final_pred_inductive.withColumn('TP', when((final_pred_inductive['Cytoplasm']== 1) & (final_pred_inductive['prediction']==1), 1).otherwise(0))
    final_pred_inductive = final_pred_inductive.withColumn('TF', when((final_pred_inductive['Cytoplasm']== 0) & (final_pred_inductive['prediction']==0), 1).otherwise(0))
    final_pred_inductive = final_pred_inductive.withColumn('FP', when((final_pred_inductive['Cytoplasm']== 1) & (final_pred_inductive['prediction']==0), 1).otherwise(0))
    final_pred_inductive = final_pred_inductive.withColumn('FN', when((final_pred_inductive['Cytoplasm']== 0) & (final_pred_inductive['prediction']==1), 1).otherwise(0))
    
    #compute the accuracy
    TP = final_pred_inductive.filter('TP==1').count()
    TN = final_pred_inductive.filter('TF==1').count()
    FP = final_pred_inductive.filter('FP ==1').count()
    FN = final_pred_inductive.filter('FN==1').count()
    acc = (TP+TN)/(TP+TN+FP+FN)
    S = TP/(TP+TN)
    E = TN/(TN+FP)
    return acc,S,E

In [7]:
acc, S, E = train_the_classifer(df,3,"Cytoplasm",0.6)

Iteration: 0 - labeled size: 367,unlabeled size:14002
Training time: 1.4723291397094727
4377 3946 1.1091954022988506 0.43526252983293556
Adding 4377 instances from class 0, and 3946 from class 1
Confidences time: 1.0537374019622803
Updating sets time: 0.04228544235229492
Iteration: 1 - labeled size: 6635,unlabeled size:7690
Training time: 1.9286463260650635
4226 3810 1.1091954022988506 1.0891752577319587
Adding 4226 instances from class 0, and 3810 from class 1
Confidences time: 8.09591555595398
Updating sets time: 0.06188797950744629
Iteration: 2 - labeled size: 13069,unlabeled size:1256
Training time: 87.8832221031189
Adding 509 instances from class 0, and 459 from class 1
Confidences time: 16.752129077911377
Updating sets time: 0.10623836517333984
Iteration: 2 - labeled size: 13856,unlabeled size:469
Training time: 229.89652395248413


In [8]:
#result
acc,S,E

(0.5789473684210527, 0.7662337662337663, 0.4090909090909091)