In [1]:
import pandas
import psycopg2
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import List

from sklearn.metrics import roc_auc_score

from utils.nlp.word_cut import tokenizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from getpass import getpass

# Open the PostgreSQL connection that the later cells pass to pandas.read_sql.
# The password is requested interactively at call time, so it is never written
# into the notebook source.
conn = psycopg2.connect(
    host="172.30.0.59",
    port="5432",
    database="resman",
    user="resman",
    password=getpass("Input db password"),
)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.630 seconds.
Prefix dict has been built successfully.


In [18]:

def creating_vectorizor(titles: List[str]) -> TfidfVectorizer:
    """Fit a TF-IDF vectorizer on ``titles`` and return the fitted instance.

    The imported ``tokenizer`` callable is handed to ``TfidfVectorizer`` as its
    ``analyzer``. The number of titles is printed before fitting starts.

    Args:
        titles: Title strings that make up the fitting corpus.

    Returns:
        The fitted ``TfidfVectorizer``.
    """
    print(f"Training from {len(titles)} titles")
    # No max_df / min_df document-frequency cut-offs are set.
    tfidf = TfidfVectorizer(analyzer=tokenizer, lowercase=False)
    tfidf.fit(titles)
    return tfidf

In [19]:
# Fit the vectorizer on every title stored in public.data_imagelist.
title_frame = pandas.read_sql("select title from public.data_imagelist", conn)
vectorizor = creating_vectorizor(title_frame["title"].tolist())

Training from 94511 titles


In [28]:
# Labelled titles for user_id=1. A row is selected when it has at least one
# impression, at least one click, or a positive reaction; its label `y` is true
# when click_count > 2, positive_reaction is true, or fetch_rate > 0.85.
df = pandas.read_sql("""
SELECT
    title,
    (click_count>2 OR positive_reaction is true OR fetch_rate>0.85) as y
FROM
    public.imagelist_master
WHERE
    user_id=1 AND (impression_count>0 OR click_count>0 OR positive_reaction is true)
""", conn)

# Vectorize the titles with the already-fitted vectorizer, then hold out 15 %
# of the rows for evaluation; random_state is fixed so the split is repeatable.
title_matrix = vectorizor.transform(df["title"].tolist())
labels = df["y"].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    title_matrix,
    labels,
    test_size=0.15,
    random_state=0,
)

[LibLinear]C=10 Test score=0.8040847613144766, train score=0.983623610872184


In [36]:
# Inverse regularisation strength handed to LogisticRegression.
C = 1
lr_model = LogisticRegression(
    penalty='l2',
    C=C,
    solver='liblinear',
    class_weight="balanced",
    max_iter=4000,
    verbose=True,
)
lr_model.fit(X_train, y_train)

# ROC AUC on the held-out split first, then on the training split, both
# computed from the raw decision-function scores.
test_auc = roc_auc_score(y_test, lr_model.decision_function(X_test))
train_auc = roc_auc_score(y_train, lr_model.decision_function(X_train))
print("C={} Test score={}, train score={}".format(C, test_auc, train_auc))

[LibLinear]C=1 Test score=0.8243806509697787, train score=0.9410384238235298


In [37]:
# Pair every vocabulary term with its logistic-regression coefficient and rank
# the pairs from the most positive weight down to the most negative one.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer `get_feature_names_out` and only fall back on older installations.
if hasattr(vectorizor, "get_feature_names_out"):
    # Returns an array; convert it to a plain list of terms.
    feature_names = list(vectorizor.get_feature_names_out())
else:
    feature_names = vectorizor.get_feature_names()
features = sorted(
    zip(feature_names, lr_model.coef_[0].tolist()),
    key=lambda pair: -pair[1],
)

In [38]:
# First 40 entries of `features` (sorted by descending coefficient): the terms
# with the largest positive weights in the fitted model.
features[:40]

[('坦克', 5.930476187651531),
 ('熟女', 5.625434714620802),
 ('大妈', 5.484892749963467),
 ('微胖', 5.465565022667665),
 ('大姐', 5.078154033448531),
 ('阿姨', 4.464852309224589),
 ('丰满', 4.44913596456246),
 ('肉肉', 4.0264116813277315),
 ('老', 3.9550246342201656),
 ('一线天', 3.742100126962617),
 ('农村', 3.738874294901159),
 ('丑', 3.613875226293607),
 ('胖妞', 3.4066180353127113),
 ('胖妹', 3.396289240572246),
 ('馒头', 3.3894142806356182),
 ('胖', 3.3644653112744747),
 ('无毛', 3.358567185602998),
 ('眼镜', 3.265338917049555),
 ('姐姐', 3.2558146963111856),
 ('熟妇', 2.9547282709456586),
 ('小萝莉', 2.9301110460896886),
 ('高中', 2.824696338204029),
 ('年纪', 2.808521188082216),
 ('白虎', 2.8069035629533334),
 ('肉', 2.801830791935213),
 ('肥', 2.783152392000311),
 ('肥逼', 2.714232416242113),
 ('多肉', 2.713003545643903),
 ('丰腴', 2.6662018884613783),
 ('寡妇', 2.621573615448385),
 ('肉乎乎', 2.6171295607403295),
 ('肉感', 2.6066624865297077),
 ('大嫂', 2.57252919782641),
 ('胖胖的', 2.5551623063630227),
 ('半老徐娘', 2.4918118784690857),
 ('初中',

In [27]:
# Last 40 entries of `features` (sorted by descending coefficient): the terms
# with the lowest, i.e. most negative, weights in the fitted model.
features[-40:]

[('户外', -2.893643115241421),
 ('手感', -2.9428114203503997),
 ('情人', -2.9429608739239006),
 ('黑森林', -2.962968045302646),
 ('车震', -3.0319139061471354),
 ('水嫩', -3.036083041498311),
 ('女友', -3.0391792881746205),
 ('制服', -3.0582838540228523),
 ('摄影师', -3.06179240360905),
 ('东西', -3.0849451210913617),
 ('唯美', -3.086540921216233),
 ('秘书', -3.1645497779368643),
 ('黑木耳', -3.1789050887422623),
 ('美穴', -3.2019140214986574),
 ('白色', -3.277759151452974),
 ('护士', -3.29477093524725),
 ('美好', -3.302158022368162),
 ('裸体', -3.33953179080623),
 ('激情', -3.353695347583527),
 ('娇妻', -3.3559554400591187),
 ('情侣', -3.356674449658241),
 ('性感', -3.4518342143410523),
 ('黑色', -3.4678003060306595),
 ('情趣内衣', -3.4813929722885644),
 ('黑丝', -3.4960078596851054),
 ('大学生', -3.5440203048979173),
 ('骚妻', -3.5607094760498494),
 ('OL', -3.628617913086124),
 ('呀', -3.7021418264293717),
 ('空姐', -3.8433402966492425),
 ('道具', -3.8498080677921522),
 ('【', -3.856201107449009),
 ('长腿', -3.940717587787523),
 ('拍', -3.9938767274848