# Modèle Random Forest
Ce modèle a été moins développé que les autres, mais son code reste disponible ci-dessous.
## Initialisation de PySpark

In [15]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import HashingTF, CountVectorizer, Tokenizer, IDF
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Reuse the active Spark session if there is one, otherwise start a new one
# against the 'local[16]' master.
spark = (
    SparkSession.builder
    .master('local[16]')
    .getOrCreate()
)

print("SparkContext created")

SparkContext created


## Récupération des données et répartition des classes

In [16]:
# Location of the training CSV, relative to the notebook.
filename = '../resources/training_noemoticon.csv'

# Explicit schema for the six columns of the CSV file.
schema = StructType([
    StructField("target", IntegerType(), True),
    StructField("id", StringType(), True),
    StructField("date", StringType(), True),
    StructField("query", StringType(), True),
    StructField("author", StringType(), True),
    StructField("tweet", StringType(), True)])

# The schema must be given through .schema(): when passed as a keyword to
# .options() it is only stored as an unrecognised string option and silently
# ignored. Applying it explicitly also makes inferSchema (and its extra pass
# over the file) unnecessary.
df = (spark.read
      .schema(schema)
      .options(ignoreLeadingWhiteSpace=True)
      .csv(filename))

# Discard every row that contains at least one null value.
df = df.dropna()

official_col = ['class', 'tweet_id', 'date', 'query', 'username', 'content']

# Rename all columns positionally in a single call.
df = df.toDF(*official_col)

# Only the label and the tweet text are used in the rest of the notebook.
df = df.select("class","content")

print("Dataset class repartitions")
gr = df.groupBy("class").count()
gr.show()

print("Dataset schema :")
df.printSchema()

Dataset class repartitions
+-----+------+
|class| count|
+-----+------+
|    0|800000|
|    4|800000|
+-----+------+

Dataset schema :
root
 |-- class: integer (nullable = true)
 |-- content: string (nullable = true)



## Traitement des features

In [17]:
# Selects the term-frequency stage: True -> CountVectorizer, False -> HashingTF.
cVec = False

# Split each tweet ("content") into a list of tokens ("words").
tokenizer = Tokenizer(inputCol="content", outputCol="words")
df = tokenizer.transform(df)

if cVec:
    cv = CountVectorizer(inputCol="words", outputCol="r_features")
    # CountVectorizer is an estimator: fit() only returns the fitted model,
    # which must then be applied to the data to produce the "r_features" column.
    df = cv.fit(df).transform(df)
else:
    # HashingTF is a plain transformer, so there is no fitting step.
    hashtf = HashingTF(inputCol="words", outputCol="r_features")
    df = hashtf.transform(df)

# Rescale the raw term frequencies with IDF weights into the final "features" column.
idf = IDF(inputCol="r_features", outputCol="features")

step = idf.fit(df)
df = step.transform(df)

## Partage entrainement et test (80/20)

In [18]:
# Split the data into training (80%) and test (20%) sets.
# A fixed seed keeps the split, and therefore the reported metrics,
# reproducible from one run of the notebook to the next.
train, test = df.randomSplit([0.8, 0.2], seed=42)

## Entrainement et évaluation du modèle

In [None]:
# Train a random forest (maxDepth=10) on the training split, using the
# "features" column as input and "class" as the label.
rf = RandomForestClassifier(
    labelCol="class",
    featuresCol="features",
    maxDepth=10,
)
model = rf.fit(train)

# Apply the fitted model to the held-out split.
predictions = model.transform(test)

# Accuracy of the "prediction" column measured against the "class" labels.
evaluator = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)

print()
print("Accuracy = ", accuracy)

# Row counts for every (true class, predicted class) pair.
confusion = predictions.groupBy('class', 'prediction').count()
confusion.show()