# Functions!

In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk
nltk.data.path.append("/home/romaric/code/Romaric1209/project-roma/notebooks/roma_NTLK_Data_Cache")
from nltk.corpus import cmudict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
from textblob import TextBlob
from gensim.models import LsiModel
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
import textstat


In [None]:

cmu_dict = cmudict.dict()  # This should load from the cache
print(cmu_dict["hello"])

In [None]:
def word_count(text):
    if not isinstance(text, str):  # Convert to string if it's not
       text = str(text)
    return len(text.split())

In [None]:
def basic_cleaning(text):
    if not isinstance(text, str):  # Convert to string if it's not
       text = str(text)
    # Remove whitespace
    prepoc_text = text.strip()
    # Lowercasing
    prepoc_text = prepoc_text.lower()
    # remove digits
    prepoc_text = "".join(char for char in prepoc_text if not char.isdigit())
    # remove punctuation
    for punctuation in string.punctuation:
        prepoc_text = prepoc_text.replace(punctuation," ")
    # remove regex
    prepoc_text = re.sub('<[^<]+?',"",prepoc_text)

    return prepoc_text

In [None]:
def cons_density(text):

    consonnant = sum(1 for char in text if char.isalpha() and char not in "aeiou")
    vowel = sum(1 for char in text if char.isalpha() and char in "aeiou")
    total_letters = vowel + consonnant
    return round((consonnant/(vowel + consonnant)),3) if total_letters > 0 else 0

In [None]:
cmu_dict = cmudict.dict()

def get_word_stress(word):
    if word in cmu_dict:
        return sum(int(char) for syllable in cmu_dict[word][0] for char in syllable if char.isdigit())
    return 0

def get_sentence_stress(sentence):
    words = sentence.split()
    stress_values = [get_word_stress(word) for word in words]
    return sum(stress_values)

In [None]:
def redundance(text):
    # give a redundance score, considering the lenght of each text, if a lemmatized words appears more than three times the mean, it is considered redundant.

    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    clean_tokens = [w for w in tokens if w not in stop_words]

    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(w) for w in clean_tokens]

    word_counts = Counter(lemmatized_tokens)
    mean_freq = sum(word_counts.values()) / len(word_counts) if len(word_counts)!= 0 else 0

    if mean_freq != 0:
        score = sum(1 for word, count in word_counts.items() if count > 2.5 * mean_freq)
    else:
        score = 0

    return score

In [None]:
def sentiment_polarity(text):
    sent_pol = TextBlob(text).sentiment.polarity
    return abs(round(sent_pol,3))

In [None]:
def word_choice(text):
    common_ai_words =["commendable",'transhumanist', 'meticulous', 'elevate','hello', 'tapestry','leverage',
                  'journey', 'headache','resonate','testament','explore', 'binary','delve',
                  'enrich', 'seamless','multifaceted', 'sorry','foster', 'convey', 'beacon',
                  'interplay', 'oh', 'navigate','form','adhere','cannot', 'landscape','remember',
                  'paramount', 'comprehensive', 'placeholder','grammar','real','summary','symphony',
                  'furthermore','relationship','ultimately','profound','art','supercharge','evolve',
                  'beyoud','reimagine','vibrant', 'robust','pivotal','certainly','quinoa','orchestrate','align',
                  'diverse','recommend','annals','note','employ','bustling','indeed','digital','enigma', 'outfit',
                  'indelible','refrain','culture','treat','emerge','meticulous','esteemed','weight','whimsical','bespoke',
                  'highlight','antagonist','unlock','key','breakdown','tailor','misinformation','treasure','paradigm','captivate',
                  'song','underscore','calculate','especially','climate','hedging','inclusive','exercise','ai','embrace',
                  'level','nuance','career','dynamic','accent','ethos','cheap','firstly','online','goodbye'
                  ]
    text = text.translate(str.maketrans('', '', string.punctuation)).lower()
    word_count = 0
    for word in text.split():
        if word in common_ai_words:
            word_count += 1

    return word_count

In [None]:
def coherence(text):
    # uses gensim to measure coherence, use the lsi model(latent semantic indexing, coherence c_v because we provide the text)
    tokens = word_tokenize(text)
    if not tokens:
        coherence_score = 0
    else:
        dictionary = corpora.Dictionary([tokens])
        corpus_gensim = [dictionary.doc2bow(tokens)]
        lsa_model = LsiModel(corpus_gensim, id2word=dictionary)

        coherence_model = CoherenceModel(
            model=lsa_model,
            texts=[tokens],
            dictionary=dictionary,
            coherence='c_v'
        )
        coherence_score = coherence_model.get_coherence()
    return coherence_score

In [None]:
def reading_ease(text):
    reading_ease= textstat.flesch_reading_ease(text)
    return reading_ease


def gunning_fog(text):
    gunning_fog = textstat.gunning_fog(text)
    return gunning_fog

In [None]:
text = "Hello is the cat cat cat name, it is commendable. The cat eat the cat fish that was in the bowl of the cat, the cat is a bad cat!"
print(f'word count :{word_count(text)}')
print(f'cleaned :{basic_cleaning(text)}')
print(f'consonnance density :{cons_density(text)}')
print(f'stress value :{get_sentence_stress(text)}')
print(f'redundance :{redundance(text)}')
print(f'sentiment :{sentiment_polarity(text)}')
print(f'unusual word count :{word_choice(text)}')
print(f'coherence :{coherence(text)}')
print(f'reading ease :{reading_ease(text)}')
print(f'gunning fog :{gunning_fog(text)}')

# Create Pipeline
We want to add columns, not transform them ==> no ColumnTransformer <br>
Function transformer?<br>
But firt we need to get our preprocessed data...

In [5]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer

In [6]:
import pandas as pd
data_load = pd.read_csv("/home/romaric/code/Romaric1209/project-roma/data/1k_sampled_dataset.csv")
data = data_load.copy()
data.head()

Unnamed: 0,text,source,prompt_id,text_length,word_count
0,The Philosophy and Ethics of Transhumanism\n\n...,GPT-3.5,1920,2558,394
1,Crime-tracking app Citizen is launching its ow...,Flan-T5-XXL,0,378,62
2,The court in Novorossiysk gave two of the danc...,GLM-130B,0,621,109
3,"then drops the drumsticks, poses, then walks o...",GPT-J,0,513,90
4,On tally went to the beach. She found a sand d...,GPT-J,0,4984,846


In [7]:
data["AI_gen"] = data["source"].apply(lambda x: 0 if x == "Human" else 1)

In [8]:
X=pd.DataFrame(data["text"])
y=data["AI_gen"]

## Test No parallelism

In [None]:
class InputHandler(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if isinstance(X, str):
            X = [X]
        if isinstance(X, list):
            X = pd.DataFrame({"text": X})
        elif isinstance(X, pd.DataFrame):
            if "text" not in X.columns:
                raise ValueError("Input DataFrame must have a 'text' column")
        else:
            X = pd.DataFrame({"text": list(X)})
        return X

class HowManyWords(BaseEstimator, TransformerMixin):
    def fit(self,X,y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["word_count"]

    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            X = X["text"]
        word_c = X.apply(word_count)
        return pd.DataFrame({"word_count": word_c})

class TextPreprocessor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["preprocessed"]

    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            X = X["text"]
        cleaned = X.apply(basic_cleaning)
        return pd.DataFrame({"preprocessed": cleaned})

class ConsDensity(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["cons_density"]

    def transform(self, X):
        return X["preprocessed"].apply(cons_density).values.reshape(-1, 1)

class Stress(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["stress_value"]

    def transform(self, X):
        return X["preprocessed"].apply(get_sentence_stress).values.reshape(-1, 1)

class Sentiment(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["sentiment_score"]

    def transform(self, X):
        return X["preprocessed"].apply(sentiment_polarity).values.reshape(-1, 1)

class Redundance(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["redundance"]

    def transform(self, X):
        return X["preprocessed"].apply(redundance).values.reshape(-1, 1)

class UnusualWord(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["unusual_words"]

    def transform(self, X):
        return X["preprocessed"].apply(word_choice).values.reshape(-1, 1)

class Coherence(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["coherence"]

    def transform(self, X):
        return X["preprocessed"].apply(coherence).values.reshape(-1, 1)

class ReadingEase(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["reading_ease"]

    def transform(self, X):
        return X["text"].apply(reading_ease).values.reshape(-1, 1)

class GunningFog(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["gunning_fog"]

    def transform(self, X):
        return X["text"].apply(gunning_fog).values.reshape(-1, 1)


In [None]:
log_scaler = FunctionTransformer(lambda x: np.log1p(x), validate=True)

pipeline = Pipeline([
    ("input_handler", InputHandler()),
    ("union", FeatureUnion([
        ("preprocessed_features", Pipeline([
            ("preprocessor", TextPreprocessor()),
            ("features", FeatureUnion([
                ("cons_density", ConsDensity()),
                ("stress_value", Pipeline([
                    ("extract", Stress()),
                    ("scaler", MinMaxScaler())
                ])),
                ("sentiment_score", Sentiment()),
                ("redundance", Pipeline([
                    ("extract", Redundance()),
                    ("log_scaling", log_scaler)
                ])),
                ("unusualword", Pipeline([
                    ("extract", UnusualWord()),
                    ("log_scaling", log_scaler)
                ])),
                ("coherence", Coherence())
            ]))
        ])),
        ("original_text_features", Pipeline([
            ("features", FeatureUnion([
                ("wordcount", Pipeline([
                    ("extract", HowManyWords()),
                    ("scaler", MinMaxScaler())
                ])),
                ("readingease", Pipeline([
                    ("extract", ReadingEase()),
                    ("scaler", MinMaxScaler())
                ])),
                ("gunningfog", Pipeline([
                    ("extract", GunningFog()),
                    ("scaler", MinMaxScaler())
                ]))
            ]))
        ]))
    ]))
])


feature_names = [
    "cons_density", "stress_value", "sentiment_score",
    "redundance", "unusual_words", "coherence",
    "word_count", "reading_ease", "gunning_fog"
]

In [None]:
pipeline

In [None]:
X_processed = pipeline.fit_transform(X)
X_processed_df = pd.DataFrame(X_processed, columns=feature_names)

In [None]:
X_processed_df

## With parallelism (*not working at the moment*)

In [None]:
# class InputHandler(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None):
#         return self

#     def transform(self, X):
#         if isinstance(X, str):
#             X = [X]
#         return pd.DataFrame({"text": X})

# class TextPreprocessor(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None):
#         return self

#     def get_feature_names_out(self, input_features=None):
#         return ["preprocessed"]

#     def transform(self, X):
#         if isinstance(X, pd.DataFrame):
#             X = X["text"]
#         cleaned = X.apply(basic_cleaning)
#         return pd.DataFrame({"preprocessed": cleaned})

# class ConsDensity(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None):
#         return self

#     def get_feature_names_out(self, input_features=None):
#         return ["cons_density"]

#     def transform(self, X):
#         return X["preprocessed"].apply(cons_density).values.reshape(-1, 1)

# class Stress(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None):
#         return self

#     def get_feature_names_out(self, input_features=None):
#         return ["stress_value"]

#     def transform(self, X):
#         return X["preprocessed"].apply(get_sentence_stress).values.reshape(-1, 1)

# class Sentiment(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None):
#         return self

#     def get_feature_names_out(self, input_features=None):
#         return ["sentiment_score"]

#     def transform(self, X):
#         return X["preprocessed"].apply(sentiment_polarity).values.reshape(-1, 1)

# class Redundance(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None):
#         return self

#     def get_feature_names_out(self, input_features=None):
#         return ["redundance"]

#     def transform(self, X):
#         return X["preprocessed"].apply(redundance).values.reshape(-1, 1)

# class UnusualWord(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None):
#         return self

#     def get_feature_names_out(self, input_features=None):
#         return ["unusual_words"]

#     def transform(self, X):
#         return X["preprocessed"].apply(word_choice).values.reshape(-1, 1)

# class Coherence(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None):
#         return self

#     def get_feature_names_out(self, input_features=None):
#         return ["coherence"]

#     def transform(self, X):
#         return X["preprocessed"].apply(coherence).values.reshape(-1, 1)

# class ReadingEase(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None):
#         return self

#     def get_feature_names_out(self, input_features=None):
#         return ["reading_ease"]

#     def transform(self, X):
#         return X["text"].apply(reading_ease).values.reshape(-1, 1)

# class GunningFog(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None):
#         return self

#     def get_feature_names_out(self, input_features=None):
#         return ["gunning_fog"]

#     def transform(self, X):
#         return X["text"].apply(gunning_fog).values.reshape(-1, 1)


In [None]:
# pipeline = Pipeline([
#     ("input_handler", InputHandler()),
#     ("union", FeatureUnion([
#          ("preprocessed_features", Pipeline([
#             ("preprocessor", TextPreprocessor()),
#             ("features", FeatureUnion([
#                 ("cons_density", ConsDensity()),
#                 ("stress_value", Stress()),
#                 ("sentiment_score", Sentiment()),
#                 ("redundance", Redundance()),
#                 ("unusualword", UnusualWord()),
#                 ("coherence", Coherence())
#             ]))
#         ])),
#         ("original_text_features", FeatureUnion([
#             ("readingease", ReadingEase()),
#             ("gunningfog", GunningFog())
#         ]))
#     ], n_jobs=-1))
# ])


# feature_names = [
#     "cons_density", "stress_value", "sentiment_score",
#     "redundance", "unusual_words", "coherence",
#     "reading_ease", "gunning_fog"
# ]

In [None]:
# pipeline

In [None]:
# X

In [None]:
# X_processed = pipeline.fit_transform(X)
# X_processed_df = pd.DataFrame(X_processed, columns=feature_names)

In [None]:
# processed_df.shape

In [None]:
# X_processed_df = pd.DataFrame(X_processed)
# X_processed_df.head()

In [None]:
# X_processed_df.shape

In [None]:
# X_final = pd.concat([X, X_processed_df], axis=1)
# X_final.shape

# Model

In [9]:
X_processed_df = pipeline.fit_transform(X)

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train,y_test = train_test_split(X_processed_df,y,train_size=0.7, random_state= 1, stratify= y)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(700, 9) (300, 9) (700,) (300,)


## Deep Learning

In [None]:
pip install tensorflow

In [130]:
from tensorflow.keras import models, layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import optimizers

In [None]:
def initialize_model():
    model = models.Sequential()

    model.add(layers.Dense(256, activation="relu", input_dim=X_processed_df.shape[1]))
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(0.3))


    model.add(layers.Dense(128, activation="relu"))
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(0.3))


    model.add(layers.Dense(64, activation="relu"))
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(0.3))


    model.add(layers.Dense(1, activation="sigmoid"))

    return model

In [141]:
model = initialize_model()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
model.summary()

In [142]:
def compile_model(model):

    return  model.compile(loss='binary_crossentropy', optimizer=optimizers.Adam(learning_rate=0.001), metrics=['accuracy'])

In [143]:
es = EarlyStopping(patience = 20,restore_best_weights=True, monitor='val_loss')
compile_model(model)
history = model.fit(
    X_train, y_train,
    epochs=500,
    callbacks=[es],
    validation_split = 0.2
    )

Epoch 1/500
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 36ms/step - accuracy: 0.6566 - loss: 0.7168 - val_accuracy: 0.6143 - val_loss: 0.6302
Epoch 2/500
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.7294 - loss: 0.5604 - val_accuracy: 0.6143 - val_loss: 0.6177
Epoch 3/500
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.7207 - loss: 0.6131 - val_accuracy: 0.6143 - val_loss: 0.6116
Epoch 4/500
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.7433 - loss: 0.4976 - val_accuracy: 0.6143 - val_loss: 0.6015
Epoch 5/500
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.7366 - loss: 0.5164 - val_accuracy: 0.6143 - val_loss: 0.5980
Epoch 6/500
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.7264 - loss: 0.5281 - val_accuracy: 0.6429 - val_loss: 0.5891
Epoch 7/500
[1m18/18[0m [

In [144]:
baseline = model.evaluate(X_test,y_test)

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7308 - loss: 0.5077 


In [145]:
precision = baseline[1]
precision

0.75

In [None]:
from tensorflow.keras.models import save_model
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import TFSMLayer

In [None]:
# model.save(filepath=r"home\romaric\code\Romaric1209\project-roma\notebooks\roma_models\baseline_model.keras")
# model = load_model(r'/home/romaric/code/Romaric1209/project-roma/notebooks/roma_models/baseline_model.keras')

## KNN Classifier

In [70]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn_scores =[]
for i in range(1,100):
    knn = KNeighborsClassifier(n_neighbors=i)

    knn.fit(X_train, y_train)

    knn_score= knn.score(X_test, y_test)

    knn_scores.append(knn_score)

In [85]:
max(knn_scores)

0.77

## Decision Trees

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
tree_clf = RandomForestClassifier(criterion='gini',max_depth=3, random_state=0)

In [None]:
tree_clf.fit(X_train, y_train)

In [28]:
tree_clf.score(X_test,y_test)

0.78

In [None]:
tree_clf_scores=[]

for i in range(1,100):
    tree_clf = RandomForestClassifier(criterion='entropy',max_depth=i, random_state=0)
    tree_clf.fit(X_train, y_train)
    tree_clf_score= tree_clf.score(X_test,y_test)
    tree_clf_scores.append(tree_clf_score)

max(tree_clf_scores)


0.79

### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_validate

tree_clf_adb = AdaBoostClassifier(
    RandomForestClassifier(criterion='gini',max_depth=3, random_state=0))

cv_results = cross_validate(tree_clf_adb, X_train, y_train, scoring = ["accuracy","roc_auc","f1"], cv=5)

In [43]:
cv_results

{'fit_time': array([10.9450767 , 12.38652587, 11.00603437, 12.85075879, 10.84119582]),
 'score_time': array([1.0050745 , 1.21296477, 1.02080965, 1.24233079, 1.04374623]),
 'test_accuracy': array([0.75      , 0.70714286, 0.72857143, 0.78571429, 0.77857143]),
 'test_roc_auc': array([0.86848635, 0.81989247, 0.82247725, 0.89371381, 0.85972193]),
 'test_f1': array([0.72      , 0.672     , 0.68852459, 0.765625  , 0.75590551])}

In [48]:
print("accuracy: " + str(cv_results["test_accuracy"].mean()))
print("ROC-AUC: " + str(cv_results["test_roc_auc"].mean()))
print("F1: " + str(cv_results["test_f1"].mean()))

accuracy: 0.75
ROC-AUC: 0.8528583652968649
F1: 0.7204110203949916


### Gradient Boosting

In [49]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
tree_gb_clf = GradientBoostingClassifier(
   n_estimators=100,
   learning_rate=0.01,
   max_depth=3
)

In [68]:
tree_gb_clf.fit(X_train,y_train)

In [69]:
tree_gb_clf.score(X_test,y_test)

0.7666666666666667

## Stacking

In [88]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [86]:
base_models = [
    ("rf", RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)),  # Tuned RF
    ("knn", KNeighborsClassifier(n_neighbors=5)),  # Optimized KNN
    ("tree_gb", GradientBoostingClassifier(learning_rate=0.01, max_depth=3))
]

In [89]:
meta_model = LogisticRegression()

In [90]:
stacking_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model
)

In [91]:
param_grid = {
    'final_estimator__C': [0.1, 1, 10],  # Regularization strength for Logistic Regression
    'final_estimator__solver': ['liblinear', 'lbfgs']
}

In [92]:
grid_search = GridSearchCV(stacking_clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

In [95]:
grid_search.fit(X_train, y_train)

In [100]:
best_stacking_model = grid_search.best_estimator_

In [103]:
y_pred = best_stacking_model.predict(X_test)

In [104]:
from sklearn.metrics import accuracy_score, classification_report
accuracy_score = accuracy_score(y_test, y_pred)
accuracy_score

0.76

In [105]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.89      0.81       168
           1       0.81      0.59      0.68       132

    accuracy                           0.76       300
   macro avg       0.77      0.74      0.75       300
weighted avg       0.77      0.76      0.75       300



# Test

In [3]:
import joblib
from roma_pipeline import InputHandler, HowManyWords, TextPreprocessor, ConsDensity, Stress, Sentiment,Redundance,UnusualWord,Coherence,ReadingEase,GunningFog
pipeline = joblib.load("roma_pipeline.joblib")

In [2]:
X = input()

In [3]:
X_new_processed = pipeline.fit_transform(X)

In [6]:
X_new_processed

array([[0.467, 0.   , 0.   , 0.   , 0.   , 1.   , 0.   , 0.   , 0.   ]])

In [4]:
from tensorflow.keras.models import load_model
model = load_model('/home/romaric/code/Romaric1209/project-roma/notebooks/roma_models/baseline_model.keras')

2025-02-28 15:28:50.294340: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-28 15:28:50.857366: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-28 15:28:51.315436: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740727731.678772   59797 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740727731.797352   59797 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-28 15:28:53.269648: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [5]:
model.predict(X_new_processed)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 271ms/step


array([[0.92649466]], dtype=float32)