Method 1: Frequency-based Method

In [None]:
from collections import Counter
import re

def frequency_based_method(text, top_n=10):
    """Return the most frequent word tokens in ``text``.

    The text is lower-cased and split into runs of word characters
    (``\\w+``), so punctuation and hyphens act as separators.

    Args:
        text: Input string to analyse.
        top_n: Maximum number of ``(word, count)`` pairs to return.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent, as produced by ``Counter.most_common``.
    """
    tokens = (match.group() for match in re.finditer(r'\w+', text.lower()))
    return Counter(tokens).most_common(top_n)

# Sample sentence reused by the demo cells that follow.
text = "This is an example text. This text is for testing the frequency-based method."

# Show the five most frequent words of the sample sentence.
keywords = frequency_based_method(text, 5)
print(keywords)


Method 2: Mean and Variance

In [None]:
import numpy as np

def mean_variance_method(text, top_n=10):
    """Select words whose frequency is above average but not an outlier.

    The text is lower-cased and tokenised with ``\\w+``. A word is kept when
    its count is strictly greater than the mean count over all distinct
    words AND its squared deviation from that mean is strictly smaller than
    the (population) variance of the counts.

    Args:
        text: Input string to analyse.
        top_n: Maximum number of ``(word, count)`` pairs to return.

    Returns:
        A list of ``(word, count)`` tuples sorted by descending count;
        ties keep the order in which the words first appeared. Returns an
        empty list when ``text`` contains no word tokens.
    """
    words = re.findall(r'\w+', text.lower())
    if not words:
        # Without tokens there are no counts; np.mean/np.var on an empty
        # array would emit RuntimeWarnings and produce NaN.
        return []

    word_counts = Counter(words)

    counts = np.array(list(word_counts.values()))
    mean = np.mean(counts)
    variance = np.var(counts)

    important_words = [
        (word, count)
        for word, count in word_counts.items()
        if count > mean and (count - mean) ** 2 < variance
    ]

    # list.sort is stable, so equally frequent words keep insertion order.
    important_words.sort(key=lambda item: item[1], reverse=True)
    return important_words[:top_n]

# Apply the mean/variance filter to the sample sentence, keeping up to five words.
keywords = mean_variance_method(text, 5)
print(keywords)


Method 3: Justeson–Katz Filter (JKF)

In [None]:
def justeson_katz_filter(text):
    """Count two-word candidate terms found by a regular expression.

    The lower-cased text is scanned for non-overlapping matches of an
    optional leading word followed by two whitespace-separated words. Only
    the last two words (the ``adj`` and ``noun`` groups) are kept and joined
    with a single space. No part-of-speech tagging is performed: the group
    names are labels only, so any two consecutive word tokens can match.

    Args:
        text: Input string to analyse.

    Returns:
        A list of ``(term, count)`` tuples ordered by descending count, with
        ties in first-seen order.
    """
    pair_pattern = re.compile(r'\b(?:\w+\s+){0,1}(?P<adj>\w+)\s+(?P<noun>\w+)\b')
    tally = Counter()
    for found in pair_pattern.finditer(text.lower()):
        tally[" ".join(found.group("adj", "noun"))] += 1
    return tally.most_common()

# Print every two-word candidate term found in the sample sentence.
keywords = justeson_katz_filter(text=text)
print(keywords)


Combined Method

In [None]:
def combined_keyword_extraction(text, top_n=10):
    """Merge the keywords produced by the three extraction methods.

    Runs the frequency-based method, the mean/variance method and the
    Justeson-Katz style filter (truncated to ``top_n`` entries) on ``text``
    and unions the resulting terms, discarding their counts.

    Args:
        text: Input string to analyse.
        top_n: Maximum number of entries taken from each method.

    Returns:
        A set of keyword strings (single words and two-word terms).
    """
    ranked_lists = (
        frequency_based_method(text, top_n),
        mean_variance_method(text, top_n),
        justeson_katz_filter(text)[:top_n],
    )
    return {term for ranked in ranked_lists for term, _ in ranked}

# Union of the keywords from all three methods for the sample sentence.
combined_keywords = combined_keyword_extraction(text, 10)
print(combined_keywords)


Z-test

In [None]:
from scipy import stats

import numpy as np


def _two_sample_z_test(first, second):
    """Return ``(z, p)`` for a two-sided, two-sample z-test of equal means.

    The standard error is built from the unbiased sample variances
    (``ddof=1``) and the p-value is taken from the standard normal
    distribution, not Student's t distribution.

    NOTE(review): a z-test is normally justified only for large samples or
    known population variances; for small samples the t-test is the usual
    choice.
    """
    a = np.asarray(first, dtype=float)
    b = np.asarray(second, dtype=float)
    standard_error = np.sqrt(a.var(ddof=1) / a.size + b.var(ddof=1) / b.size)
    z = (a.mean() - b.mean()) / standard_error
    # Two-sided p-value: probability mass in both normal tails beyond |z|.
    p = 2.0 * stats.norm.sf(abs(z))
    return float(z), float(p)


# Two independent samples
sample1 = [1, 2, 3, 4, 5]
sample2 = [2, 3, 4, 5, 6]

# Compute the Z statistic and its two-sided p-value
# (the previous stats.ttest_ind call performed a t-test, not a Z-test).
z_stat, p_value = _two_sample_z_test(sample1, sample2)

print("Z-statistic:", z_stat)
print("p-value:", p_value)


t-test

In [None]:
from scipy import stats

# Hypothesised population mean
mu = 3

# Observations of the single sample
sample = [1, 2, 3, 4, 5]

# One-sample t-test of `sample` against `mu`: yields the T statistic and p-value
t_stat, p_value = stats.ttest_1samp(sample, popmean=mu)

print("T-statistic:", t_stat)
print("p-value:", p_value)


χ²-test

In [None]:
from scipy.stats import chi2_contingency

# Two-dimensional table of observed frequencies
observed = [
    [10, 20],
    [30, 40],
]

# Compute the chi-squared statistic, p-value, degrees of freedom and expected frequencies
chi2_stat, p_value, dof, expected = chi2_contingency(observed=observed)

print("χ²-statistic:", chi2_stat)
print("p-value:", p_value)