In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier,DecisionTreeClassifier,GBTClassifier
from pyspark.ml.feature import VectorAssembler,StringIndexer,OneHotEncoder
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator
from pyspark.sql.types import IntegerType,LongType,StructField,StructType
from pyspark.ml import pipeline,Pipeline
import pyspark.sql.functions as SF

In [0]:
# Obtain the Spark session used by the LIBSVM tree-model experiments below.
# getOrCreate() hands back an already-running session when one exists.
SS = (
    SparkSession.builder
    .appName('Tree')
    .getOrCreate()
)

In [0]:
# Read the sample dataset through Spark's LIBSVM data source.
libsvm_reader = SS.read.format('libsvm')
df = libsvm_reader.load('/FileStore/tables/sample_libsvm_data-2.txt')

In [0]:
# Peek at the first five rows only; never dump the whole frame.
df.show(n=5)

In [0]:
# Print the column names and types of the loaded LIBSVM frame.
df.printSchema()

In [0]:
# 70/30 train/test split. The explicit seed makes the split -- and every
# metric computed from it -- reproducible under Restart & Run All; without
# it randomSplit draws a fresh random seed on each execution.
train, test = df.randomSplit([0.7, 0.3], seed=42)

In [0]:
# One unfitted estimator per tree-based algorithm. All three rely on the
# estimators' default feature/label column names; only the forest overrides
# a hyper-parameter (100 trees).
DCT, RFC, GRA = (
    DecisionTreeClassifier(),
    RandomForestClassifier(numTrees=100),
    GBTClassifier(),
)

In [0]:
# Fit each estimator on the training split, in the same order as before:
# decision tree, random forest, gradient-boosted trees.
dct, rfc, gra = (estimator.fit(train) for estimator in (DCT, RFC, GRA))

In [0]:
# Score the held-out split with every fitted model.
dct_pred, rfc_pred, gra_pred = (
    fitted_model.transform(test) for fitted_model in (dct, rfc, gra)
)

In [0]:
# Binary evaluators for the three models.
#
# The evaluator must be fed the model's continuous scores (`rawPrediction`),
# not the hard 0/1 `prediction` column: with only two distinct score values
# the ROC curve collapses to a single operating point and the reported area
# under the curve is not a faithful ranking metric.
dct_df = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='label')
rfc_df = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='label')
gra_df = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='label')

In [0]:
# Run each binary evaluator over the predictions of its matching model.
dct_df1, rfc_df1, gra_df1 = (
    evaluator.evaluate(predictions)
    for evaluator, predictions in zip(
        (dct_df, rfc_df, gra_df),
        (dct_pred, rfc_pred, gra_pred),
    )
)

In [0]:
# Display the binary-evaluator score of the decision-tree model.
dct_df1

In [0]:
# Display the binary-evaluator score of the random-forest model.
rfc_df1

In [0]:
# Display the binary-evaluator score of the gradient-boosted-trees model.
gra_df1

In [0]:
# Three independent accuracy evaluators, one per model; each uses the
# evaluator's default label/prediction column names.
dct_dfm, rfc_dfm, gra_dfm = (
    MulticlassClassificationEvaluator(metricName='accuracy') for _ in range(3)
)

In [0]:
# Accuracy of each model on the held-out split.
dct_df2, rfc_df2, gra_df2 = (
    evaluator.evaluate(predictions)
    for evaluator, predictions in zip(
        (dct_dfm, rfc_dfm, gra_dfm),
        (dct_pred, rfc_pred, gra_pred),
    )
)

In [0]:
# Session handle for the college (private vs. public) classification project.
# getOrCreate() hands back an already-running session when one exists.
ss1 = (
    SparkSession.builder
    .appName('project')
    .getOrCreate()
)

In [0]:
# Load the `college_csv` table two ways: through the table reader and through
# an equivalent SQL query.
df1 = ss1.read.table('college_csv')
# Query through the session we already hold rather than the legacy
# `sqlContext` global, which is only injected by some hosted notebook
# environments and is never defined in this notebook itself.
df2 = ss1.sql('select * from college_csv')

In [0]:
# Inspect the column types of the college table as loaded (before casting).
df1.printSchema()

In [0]:
# Preview the first two rows of the college table.
df1.show(n=2)

In [0]:
# Cast every column from the third one onward to integer, leaving the first
# two columns untouched. This is done in ONE select instead of a withColumn
# loop: each withColumn call adds another projection to the query plan, which
# Spark's documentation warns against when adding/replacing many columns.
# Column order and names are unchanged, and the cell is safe to re-run.
# NOTE(review): an integer cast truncates any fractional values (e.g. a ratio
# column) -- confirm this is intended.
untouched_cols = df1.columns[:2]
integer_cols = df1.columns[2:]
df1 = df1.select(
    *[df1[name] for name in untouched_cols],
    *[df1[name].cast(IntegerType()).alias(name) for name in integer_cols],
)

In [0]:
# Confirm the schema after the integer cast.
df1.printSchema()

In [0]:
# 70/30 train/test split of the college table. The explicit seed keeps the
# split (and the accuracies reported from it) reproducible across runs;
# without it randomSplit draws a fresh random seed on each execution.
train, test = df1.randomSplit([0.7, 0.3], seed=42)

In [0]:
# --- Label handling -------------------------------------------------------
# Index the string column `Private` into the numeric label column `Private1`.
SI = StringIndexer(inputCols=['Private'], outputCols=['Private1'])
# One-hot encoder over the indexed label, writing `Private2`.
OHE = OneHotEncoder(inputCols=['Private1'], outputCols=['Private2'])

# --- Feature vector -------------------------------------------------------
college_feature_cols = [
    'Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc',
    'F_Undergrad', 'P_Undergrad', 'Outstate', 'Room_Board', 'Books',
    'Personal', 'PhD', 'Terminal', 'S_F_Ratio', 'perc_alumni',
    'Expend', 'Grad_Rate',
]
VA1 = VectorAssembler(inputCols=college_feature_cols, outputCol='features')

# --- Classifiers ----------------------------------------------------------
# Each estimator learns to predict the indexed label `Private1`.
DCT, RFC, GRA = (
    classifier_cls(labelCol='Private1')
    for classifier_cls in (
        DecisionTreeClassifier,
        RandomForestClassifier,
        GBTClassifier,
    )
)

In [0]:
# Each pipeline shares the same preprocessing (label indexing + feature
# assembly) and differs only in its final classifier stage.
pipe_dct, pipe_rfc, pipe_gra = (
    Pipeline(stages=[SI, VA1, classifier]) for classifier in (DCT, RFC, GRA)
)

In [0]:
# Decision tree: fit the pipeline on the training split (despite its name,
# `train_data` holds the fitted pipeline model), score the test split, then
# keep just the indexed label and the predicted class for evaluation.
train_data = pipe_dct.fit(train)
pred = train_data.transform(test)
Eval_data = pred.select('Private1', 'prediction')

In [0]:
# Random forest: fit the pipeline on the training split (`train_data1` is the
# fitted pipeline model), score the test split, then keep just the indexed
# label and the predicted class for evaluation.
train_data1 = pipe_rfc.fit(train)
pred1 = train_data1.transform(test)
Eval_data1 = pred1.select('Private1', 'prediction')

In [0]:
# Gradient-boosted trees: fit the pipeline on the training split
# (`train_data2` is the fitted pipeline model), score the test split, then
# keep just the indexed label and the predicted class for evaluation.
train_data2 = pipe_gra.fit(train)
pred2 = train_data2.transform(test)
Eval_data2 = pred2.select('Private1', 'prediction')

In [0]:
# Three identically configured accuracy evaluators (one per model) comparing
# `prediction` against the indexed label `Private1`.
BCE, BCE1, BCE2 = (
    MulticlassClassificationEvaluator(
        predictionCol='prediction',
        labelCol='Private1',
        metricName='accuracy',
    )
    for _ in range(3)
)


In [0]:
# Test-set accuracy of the decision-tree pipeline.
BCE.evaluate(Eval_data)

In [0]:
# Test-set accuracy of the random-forest pipeline.
BCE1.evaluate(Eval_data1)

In [0]:
# Test-set accuracy of the gradient-boosted-trees pipeline. Uses BCE2, the
# evaluator created for this model, rather than reusing the decision-tree
# evaluator BCE (a copy-paste slip that left BCE2 unused).
BCE2.evaluate(Eval_data2)

In [0]:
# Inspect the columns produced by the gradient-boosted-trees pipeline.
pred2.printSchema()

In [0]:
# Inspect the columns produced by the decision-tree pipeline.
pred.printSchema()

In [0]:
# Look up the signature and documentation of `translate`. Calling it with no
# arguments raises TypeError (it has required positional parameters), which
# would abort a Restart & Run All at this cell.
help(SF.translate)

In [0]:
# Session handle for the dog-food feature-importance exercise.
# getOrCreate() hands back an already-running session when one exists.
SSS = (
    SparkSession.builder
    .appName('RFC')
    .getOrCreate()
)

In [0]:
# Load the dog-food CSV; the first row is the header and column types are
# inferred from the data.
df = SSS.read.csv(
    '/FileStore/tables/dog_food.csv',
    header=True,
    inferSchema=True,
)

In [0]:
# Preview the dog-food data (show() with no argument uses its default row limit).
df.show()

In [0]:
# Pack columns A-D into a single `features` vector column.
VA = VectorAssembler(
    outputCol='features',
    inputCols=['A', 'B', 'C', 'D'],
)

In [0]:
# Append the assembled `features` vector and keep only the model inputs.
# Dropping any existing `features` column first makes this cell re-runnable:
# VectorAssembler refuses to transform a frame that already contains its
# output column, and `df` is overwritten here. drop() is a no-op when the
# column is absent, so the first run behaves exactly as before.
df = VA.transform(df.drop('features'))
data = df.select('features', 'Spoiled')

In [0]:
# Preview the two-column (features, Spoiled) frame that feeds the model.
data.show()

In [0]:
# Random forest predicting `Spoiled` from the assembled `features` vector.
RFC = RandomForestClassifier(
    labelCol='Spoiled',
    featuresCol='features',
)

In [0]:
# Fit on the full dataset: the goal here is feature importance, so no
# train/test split is made.
trainmodel = RFC.fit(dataset=data)

In [0]:
# Display the fitted forest's per-feature importances; positions follow the
# VectorAssembler input order (A, B, C, D).
trainmodel.featureImportances